Yuan.Wang
1 year ago
7 changed files with 0 additions and 2367 deletions
@ -1,539 +0,0 @@ |
|||||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
# contributor license agreements. See the NOTICE file distributed with |
|
||||||
# this work for additional information regarding copyright ownership. |
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
# (the "License"); you may not use this file except in compliance with |
|
||||||
# the License. You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
import re |
|
||||||
|
|
||||||
# A simple python script to generate an HTML entity map and a regex alternation |
|
||||||
# for inclusion in HTMLStripCharFilter.jflex. |
|
||||||
|
|
||||||
# Entities whose all-upper-case spelling (e.g. "&QUOT;") is accepted as well.
# Used both for the regex alternation and for the generated Java lookup map,
# so the two can never drift apart.
_UPPER_CASE_VARIANTS = ('quot', 'copy', 'gt', 'lt', 'reg', 'amp')


def _wrap_entries(first_line, entries, indent=' ', width=80):
    """Pack *entries* onto output lines that stay shorter than *width*.

    Starts with *first_line*; whenever appending the next entry would make the
    current line reach *width* characters, the line is closed and a new one is
    started with *indent*. Returns the list of lines; the last element is the
    still-open final line, so the caller can append or trim a terminator.
    """
    lines = []
    current = first_line
    for entry in entries:
        if len(current) + len(entry) >= width:
            lines.append(current)
            current = indent
        current += entry
    lines.append(current)
    return lines


def main():
    """Print the JFlex fragment for HTMLStripCharFilter.jflex to stdout.

    Output, in order: the Apache license header, a ``CharacterEntities``
    regex alternation listing every entity name found in get_entity_text()
    (plus upper-case variants for the names in _UPPER_CASE_VARIANTS), and a
    ``%{ ... %}`` Java block that builds the name -> character lookup maps.

    Only single-argument ``print(...)`` calls are used, which produce the
    same output under Python 2 (print statement) and Python 3 (function).

    NOTE(review): runs of whitespace inside the string literals appear
    collapsed to a single space in the reviewed copy of this file; confirm
    the intended indentation of the generated Java before regenerating.
    """
    print(get_apache_license())

    # Map entity name -> Java string-literal body for its replacement char.
    codes = {}
    entity_pattern = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
    for line in get_entity_text().split('\n'):
        match = entity_pattern.match(line)
        if not match:
            continue
        name = match.group(1)
        if name == 'quot':
            # A bare double quote would terminate the Java string literal.
            codes[name] = r'\"'
        elif name == 'nbsp':
            # nbsp is deliberately emitted as an ordinary space.
            codes[name] = ' '
        else:
            codes[name] = r'\u%04X' % int(match.group(2))

    keys = sorted(codes)

    # --- regex alternation -------------------------------------------------
    alternatives = []
    for key in keys:
        # Only the very first alternative has no leading ' | ' separator.
        template = ' | "%s"' if alternatives else '"%s"'
        alternatives.append(template % key)
        if key in _UPPER_CASE_VARIANTS:
            alternatives.append(' | "%s"' % key.upper())
    alternation = _wrap_entries('CharacterEntities = ( ', alternatives)
    # The Python 2 original used ``print output_line, ')'``, i.e. a space
    # separates the last alternative from the closing parenthesis.
    alternation[-1] += ' )'
    for out in alternation:
        print(out)

    # --- Java user code ----------------------------------------------------
    print('%{')
    print(' private static final Map<String,String> upperCaseVariantsAccepted')
    print(' = new HashMap<String,String>();')
    print(' static {')
    for name in _UPPER_CASE_VARIANTS:
        print(' upperCaseVariantsAccepted.put("%s", "%s");'
              % (name, name.upper()))
    print(' }')
    print(' private static final CharArrayMap<Character> entityValues')
    print(' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);'
          % len(keys))
    print(' static {')
    print(' String[] entities = {')
    table = _wrap_entries(
        ' ', [' "%s", "%s",' % (key, codes[key]) for key in keys])
    # Drop the trailing comma after the final array element.
    table[-1] = table[-1][:-1]
    for out in table:
        print(out)
    print(' };')
    print(' for (int i = 0 ; i < entities.length ; i += 2) {')
    print(' Character value = entities[i + 1].charAt(0);')
    print(' entityValues.put(entities[i], value);')
    print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
    print(' if (upperCaseVariant != null) {')
    print(' entityValues.put(upperCaseVariant, value);')
    print(' }')
    print(' }')
    print(" }")
    print("%}")
|
||||||
|
|
||||||
def get_entity_text():
    """Return the W3C XHTML entity declarations that main() parses.

    The text contains the Latin 1, Special and Symbol entity sets. main()
    scans it line by line and only uses lines that start with
    ``<!ENTITY name "&#NNN;"`` (or ``"&#38;#NNN;"``); headings, DTD comments
    and comment continuation lines are ignored.

    Every entity value must therefore be written as a decimal numeric
    character reference. The decimal values below correspond to the U+XXXX
    code point quoted in the comment on the same line.
    """
    # The text below is taken verbatim from
    # <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
    # NOTE(review): the text itself cites the XHTML Modularization DTD files
    # (xhtml-lat1.ent, xhtml-special.ent, xhtml-symbol.ent) - confirm which
    # URL is the real source before refreshing it.
    text = r"""
F.1. XHTML Character Entities

XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.

<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent

Typical invocation:

<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"

Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->

<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->

F.1.2. XHTML Special Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.

<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent

Typical invocation:

<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"

Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.

Revisions:
2000-10-28: added ' and altered XML Predefined Entities for compatibility
-->

<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->

<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->

<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->

<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->

<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->

<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->

<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->

<!-- end of xhtml-special.ent -->

F.1.3. XHTML Mathematical, Greek, and Symbolic Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.

<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent

Typical invocation:

<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"

Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->

<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->

<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->

<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->

<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->

<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->

<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->

<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->

<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->

<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->

<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->

<!-- end of xhtml-symbol.ent -->
"""
    return text
|
||||||
|
|
||||||
def get_apache_license():
    """Return the ASF license header as a Java block comment.

    The returned text starts with ``/**`` and ends with the closing ``*/``
    followed by one empty line.
    """
    header_lines = (
        '/**',
        ' * Licensed to the Apache Software Foundation (ASF) under one or more',
        ' * contributor license agreements. See the NOTICE file distributed with',
        ' * this work for additional information regarding copyright ownership.',
        ' * The ASF licenses this file to You under the Apache License, Version 2.0',
        ' * (the "License"); you may not use this file except in compliance with',
        ' * the License. You may obtain a copy of the License at',
        ' *',
        ' * http://www.apache.org/licenses/LICENSE-2.0',
        ' *',
        ' * Unless required by applicable law or agreed to in writing, software',
        ' * distributed under the License is distributed on an "AS IS" BASIS,',
        ' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.',
        ' * See the License for the specific language governing permissions and',
        ' * limitations under the License.',
        ' */',
        # two trailing empty items -> the text ends with "*/\n\n"
        '',
        '',
    )
    return '\n'.join(header_lines)
|
||||||
|
|
||||||
# Script entry point: main() is called unconditionally at module level, so
# importing this module also runs the generator.
main()
|
@ -1,366 +0,0 @@ |
|||||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
# contributor license agreements. See the NOTICE file distributed with |
|
||||||
# this work for additional information regarding copyright ownership. |
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
# (the "License"); you may not use this file except in compliance with |
|
||||||
# the License. You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
import types |
|
||||||
import os |
|
||||||
import sys |
|
||||||
import random |
|
||||||
|
|
||||||
# Largest code point accepted by toUTF8().
MAX_UNICODE = 0x10FFFF

# TODO
#   - could be more minimal
#     - eg when bracket lands on a utf8 boundary, like 3 - 2047 -- they can share the two * edges
#     - also 3 2048 or 3 65536 -- it should not have an * down the red path, but it does

# MASKS[0] is bottom 1-bit
# MASKS[1] is bottom 2-bits
# ...

# Inclusive (first, last) code point for each UTF-8 encoded length;
# entry k covers the code points that take k+1 bytes.
utf8Ranges = [
    (0x0, 0x7F),
    (0x80, 0x7FF),
    (0x800, 0xFFFF),
    (0x10000, 0x10FFFF),
]

# Edge label -> color used when rendering the automaton with dot;
# labels not listed here are drawn in black (see FSA.toDOT).
typeToColor = {
    'startend': 'purple',
    'start': 'blue',
    'end': 'red',
}
|
||||||
|
|
||||||
class FSA:
    """Small finite state automaton over byte values, with labelled range edges.

    ``states`` maps a node id to a list: item 0 is the node's label, every
    further item is an edge tuple ``(edgeLabel, startByte, endByte, destNode)``.
    Callers create nodes with addNode() and assign the ``start`` (and ``end``)
    node ids as attributes on the instance before calling run().
    """

    def __init__(self):
        # maps fromNode -> [nodeLabel, (edgeLabel, startUTF8, endUTF8, endNode), ...]
        self.states = {}
        self.nodeUpto = 0

    def run(self, bytes):
        """Feed the byte values through the automaton starting at ``self.start``.

        Returns the node reached after the last byte, or -1 as soon as a byte
        has no matching edge.  Raises RuntimeError when more than one edge
        out of a node matches the same byte.
        """
        state = self.start
        for b in bytes:
            found = False
            oldState = state
            for label, s, e, n in self.states[oldState][1:]:
                if s <= b <= e:
                    if found:
                        raise RuntimeError('state %s has ambiguous output for byte %s' % (oldState, b))
                    state = n
                    found = True
            if not found:
                return -1

        return state

    def addEdge(self, n1, n2, v1, v2, label):
        """
        Adds edge from n1-n2, utf8 byte range v1-v2.
        """
        assert n1 in self.states
        # plain int check instead of the Python-2-only types.IntType
        assert isinstance(v1, int)
        assert isinstance(v2, int)
        self.states[n1].append((label, v1, v2, n2))

    def addNode(self, label=None):
        """Create a node with the given label and return its id."""
        node = self.nodeUpto
        self.states[node] = [label]
        self.nodeUpto += 1
        return node

    def toDOT(self, label):
        """Return the automaton as Graphviz DOT source for a digraph named ``label``."""
        lines = []
        w = lines.append

        # locate the nodes labelled 'start' and 'end' (the last match wins)
        endNode = startNode = None
        for nodeId, details in self.states.items():
            name = details[0]
            if name == 'end':
                endNode = nodeId
            elif name == 'start':
                startNode = nodeId

        w('digraph %s {' % label)
        w(' rankdir=LR;')
        w(' size="8,5";')
        w(' node [color=white label=""]; Ns;')

        w(' node [color=black];')
        w(' node [shape=doublecircle, label=""]; N%s [label="%s"];' % (endNode, endNode))
        w(' node [shape=circle];')

        w(' N%s [label="%s"];' % (startNode, startNode))
        w(' Ns -> N%s;' % startNode)
        for nodeId, details in self.states.items():
            w(' N%s [label="%s"];' % (nodeId, nodeId))
            for edgeType, s, e, dest in details[1:]:
                color = typeToColor.get(edgeType, 'black')
                if edgeType == 'all*':
                    # special case -- matches any utf8 byte at this point
                    edgeLabel = '*'
                elif s == e:
                    edgeLabel = '%s' % binary(s)
                else:
                    edgeLabel = '%s-%s' % (binary(s), binary(e))
                w(' N%s -> N%s [label="%s" color="%s"];' % (nodeId, dest, edgeLabel, color))
        w('}')
        return '\n'.join(lines)

    def toPNG(self, label, pngOut):
        """Write the DOT source to ``tmp.dot`` and render it to ``pngOut`` with ``dot``.

        Raises RuntimeError when the ``dot`` command exits with a non-zero status.
        """
        # close (and so flush) the file before dot reads it
        with open('tmp.dot', 'w') as dotFile:
            dotFile.write(self.toDOT(label))
        # NOTE(review): pngOut is interpolated into a shell command; only pass trusted paths.
        if os.system('dot -Tpng tmp.dot -o %s' % pngOut):
            raise RuntimeError('dot failed')
|
||||||
|
|
||||||
|
|
||||||
# MASKS[k] has the bottom k+1 bits set: 1, 3, 7, ... (32 entries).
MASKS = [(1 << (bits + 1)) - 1 for bits in range(32)]
|
||||||
|
|
||||||
def binary(x):
    """Render x as space-separated groups of 8 binary digits.

    Each group is zero-padded on the left to 8 digits and written most
    significant bit first.  Groups are emitted starting with the lowest
    8 bits of x.  Zero yields '00000000'; a negative x yields ''.
    """
    if x == 0:
        return '00000000'

    # collect the bits, least significant first
    bits = []
    while x > 0:
        bits.append('1' if x & 1 == 1 else '0')
        x = x >> 1

    groups = []
    for lo in range(0, len(bits), 8):
        group = bits[lo:lo + 8]
        # big endian within the group
        group.reverse()
        groups.append(''.join(group).rjust(8, '0'))

    return ' '.join(groups)
|
||||||
|
|
||||||
def getUTF8Rest(code, numBytes):
    """Return the trailing ``10xxxxxx`` bytes for ``code`` as a tuple.

    Each item is ``(byteValue, 6)``: the byte carries 6 bits of ``code``
    under the 0x80 marker.  The tuple holds ``numBytes`` items, the one
    built from the highest bits first.
    """
    rest = []
    for shift in range(6 * (numBytes - 1), -1, -6):
        rest.append((128 | ((code >> shift) & MASKS[5]), 6))
    return tuple(rest)
|
||||||
|
|
||||||
def toUTF8(code):
    """Encode a Unicode code point as a tuple of ``(byteValue, payloadBits)`` pairs.

    ``payloadBits`` is the number of code point bits carried by that byte:
    7 for a single-byte encoding, 5/4/3 for the lead byte of a 2/3/4 byte
    encoding, and 6 for every trailing byte.
    """
    # code = Unicode code point
    assert code >= 0
    assert code <= MAX_UNICODE

    if code < 128:
        # 0xxxxxxx
        return ((code, 7),)

    if code < 2048:
        # 110yyyxx 10xxxxxx
        lead, leadBits, trailing = (6 << 5) | (code >> 6), 5, 1
    elif code < 65536:
        # 1110yyyy 10yyyyxx 10xxxxxx
        lead, leadBits, trailing = (14 << 4) | (code >> 12), 4, 2
    else:
        # 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
        lead, leadBits, trailing = (30 << 3) | (code >> 18), 3, 3

    return ((lead, leadBits),) + getUTF8Rest(code, trailing)
|
||||||
|
|
||||||
def all(fsa, startNode, endNode, startCode, endCode, left):
    """Add a path accepting lead bytes startCode-endCode followed by any trailing bytes.

    ``left`` is the sequence of remaining UTF-8 bytes; only its length is
    used.  One 'all*' edge (byte range 128-191) is added per remaining
    byte, with fresh intermediate nodes, and the last edge ends at
    ``endNode``.
    """
    if len(left) == 0:
        fsa.addEdge(startNode, endNode, startCode, endCode, 'all')
        return

    prev = fsa.addNode()
    fsa.addEdge(startNode, prev, startCode, endCode, 'all')
    # every remaining byte but the last gets its own intermediate node
    for _ in range(len(left) - 1):
        nxt = fsa.addNode()
        fsa.addEdge(prev, nxt, 128, 191, 'all*')
        prev = nxt
    fsa.addEdge(prev, endNode, 128, 191, 'all*')
|
||||||
|
|
||||||
def start(fsa, startNode, endNode, utf8, doAll):
    """Add the edges covering the lower boundary of a range.

    ``utf8`` is the encoded lower bound, in the form produced by toUTF8().
    A single remaining byte becomes one 'start' edge from its value up to
    that value with all payload bits set.  Otherwise the lead byte gets an
    exact 'start' edge, the remaining bytes are handled recursively, and,
    when ``doAll`` is true, lead bytes above this one (up to all payload
    bits set) are accepted with any trailing bytes via all().
    """
    lead = utf8[0][0]
    # lead byte with every payload bit set
    top = lead | MASKS[utf8[0][1]-1]

    if len(utf8) == 1:
        fsa.addEdge(startNode, endNode, lead, top, 'start')
        return

    tail = utf8[1:]
    mid = fsa.addNode()
    fsa.addEdge(startNode, mid, lead, lead, 'start')
    start(fsa, mid, endNode, tail, True)
    if doAll and lead != top:
        all(fsa, startNode, endNode, lead+1, top, tail)
|
||||||
|
|
||||||
def end(fsa, startNode, endNode, utf8, doAll):
    """Add the edges covering the upper boundary of a range.

    ``utf8`` is the encoded upper bound, in the form produced by toUTF8().
    A single remaining byte becomes one 'end' edge from that byte with its
    payload bits cleared up to its value.  Otherwise, when ``doAll`` is
    true, lead bytes below this one are accepted with any trailing bytes
    via all(); then the lead byte gets an exact 'end' edge and the
    remaining bytes are handled recursively.
    """
    lead = utf8[0][0]
    payloadBits = utf8[0][1]

    if len(utf8) == 1:
        fsa.addEdge(startNode, endNode, lead & ~MASKS[payloadBits-1], lead, 'end')
        return

    if payloadBits == 5:
        # special case -- avoid creating unused edges (utf8 doesn't accept certain byte sequences):
        lowest = 194
    else:
        lowest = lead & (~MASKS[payloadBits-1])

    tail = utf8[1:]
    if doAll and lead != lowest:
        all(fsa, startNode, endNode, lowest, lead-1, tail)

    nxt = fsa.addNode()
    fsa.addEdge(startNode, nxt, lead, lead, 'end')
    end(fsa, nxt, endNode, tail, True)
|
||||||
|
|
||||||
def build(fsa,
          startNode, endNode,
          startUTF8, endUTF8):
    """Add to ``fsa`` the edges accepting every encoding from startUTF8 to endUTF8.

    Both bounds are in the form produced by toUTF8().  The range is broken
    into a start part, an optional middle part and an end part.
    """
    firstLead = startUTF8[0][0]
    lastLead = endUTF8[0][0]

    if firstLead == lastLead:
        # Degen case: lead with the same byte:
        if len(startUTF8) == 1 and len(endUTF8) == 1:
            fsa.addEdge(startNode, endNode, firstLead, lastLead, 'startend')
            return
        assert len(startUTF8) != 1
        assert len(endUTF8) != 1
        shared = fsa.addNode()
        # single value edge
        fsa.addEdge(startNode, shared, firstLead, firstLead, 'single')
        build(fsa, shared, endNode, startUTF8[1:], endUTF8[1:])
        return

    if len(startUTF8) == len(endUTF8):
        if len(startUTF8) == 1:
            fsa.addEdge(startNode, endNode, firstLead, lastLead, 'startend')
            return
        start(fsa, startNode, endNode, startUTF8, False)
        # lead bytes strictly between the two bounds accept any trailing bytes
        if lastLead - firstLead > 1:
            all(fsa, startNode, endNode, firstLead+1, lastLead-1, startUTF8[1:])
        end(fsa, startNode, endNode, endUTF8, False)
        return

    # start
    start(fsa, startNode, endNode, startUTF8, True)

    # possibly middle: every encoded length strictly between the two bounds' lengths
    for byteCount in range(1+len(startUTF8), len(endUTF8)):
        s = toUTF8(utf8Ranges[byteCount-1][0])
        e = toUTF8(utf8Ranges[byteCount-1][1])
        all(fsa, startNode, endNode,
            s[0][0],
            e[0][0],
            s[1:])

    # end
    end(fsa, startNode, endNode, endUTF8, True)
|
||||||
|
|
||||||
def main():
    """Build the automaton for the code point range given on the command line.

    Expects ``startUTF32 endUTF32 [testCode]`` in sys.argv, prints the
    binary UTF-8 form of each argument, renders the automaton to
    /tmp/outpy.png and then runs test() with 100000 samples.
    Exits with status 1 on a bad argument count or when start > end.
    """
    argc = len(sys.argv)
    if argc not in (3, 4):
        print('')
        print('Usage: python %s startUTF32 endUTF32 [testCode]' % sys.argv[0])
        print('')
        sys.exit(1)

    utf32Start = int(sys.argv[1])
    utf32End = int(sys.argv[2])

    if utf32Start > utf32End:
        print('ERROR: start must be <= end')
        sys.exit(1)

    fsa = FSA()
    fsa.start = fsa.addNode('start')
    fsa.end = fsa.addNode('end')

    print('s=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32Start)]))
    print('e=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32End)]))

    if argc == 4:
        # the optional third code point is only displayed (binary and hex)
        probe = toUTF8(int(sys.argv[3]))
        print('t=%s [%s]' %
              (' '.join([binary(x[0]) for x in probe]),
               ' '.join(['%2x' % x[0] for x in probe])))

    build(fsa, fsa.start, fsa.end,
          toUTF8(utf32Start),
          toUTF8(utf32End))

    fsa.toPNG('test', '/tmp/outpy.png')
    print('Saved to /tmp/outpy.png...')

    test(fsa, utf32Start, utf32End, 100000)
|
||||||
|
|
||||||
def test(fsa, utf32Start, utf32End, count):
    """Randomly check that ``fsa`` accepts exactly the range utf32Start..utf32End.

    Draws ``count`` code points inside the range (each must end at
    ``fsa.end``) and, when any code point lies outside the range, ``count``
    code points outside it (each must be rejected with -1).  Prints a
    FAILED line and returns False on the first mismatch, True otherwise.
    """
    # verify correct ints are accepted
    for _ in range(count):
        r = random.randint(utf32Start, utf32End)
        dest = fsa.run([tup[0] for tup in toUTF8(r)])
        if dest != fsa.end:
            print('FAILED: valid %s (%s) is not accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])))
            return False

    # There are (MAX_UNICODE + 1) - (utf32End - utf32Start + 1) code points
    # outside the range, i.e. invalidRange + 1 of them; -1 means none.
    invalidRange = MAX_UNICODE - (utf32End - utf32Start + 1)
    if invalidRange >= 0:
        # verify invalid ints are not accepted
        for _ in range(count):
            # randint is inclusive on both ends, so the upper bound is
            # invalidRange itself: this reaches the last invalid code point
            # and is still valid when invalidRange == 0.
            r = random.randint(0, invalidRange)
            if r >= utf32Start:
                # skip over the accepted range
                r = utf32End + 1 + r - utf32Start
            dest = fsa.run([tup[0] for tup in toUTF8(r)])
            if dest != -1:
                print('FAILED: invalid %s (%s) is accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])))
                return False

    return True
|
||||||
|
|
||||||
def stress():
    """Loop forever: build an automaton for a random range and check it with test().

    Prints a progress line every 10 iterations and a FAILED line for each
    range that test() rejects; the loop never stops on its own.
    """
    print('Testing...')

    iteration = 0
    while True:
        if iteration % 10 == 0:
            print('%s...' % iteration)
        iteration += 1

        # two random code points, ordered so that start <= end
        v1 = random.randint(0, MAX_UNICODE)
        v2 = random.randint(0, MAX_UNICODE)
        utf32Start, utf32End = (v2, v1) if v2 < v1 else (v1, v2)

        fsa = FSA()
        fsa.start = fsa.addNode('start')
        fsa.end = fsa.addNode('end')
        build(fsa, fsa.start, fsa.end,
              toUTF8(utf32Start),
              toUTF8(utf32End))

        if not test(fsa, utf32Start, utf32End, 10000):
            print('FAILED on utf32Start=%s utf32End=%s' % (utf32Start, utf32End))
|
||||||
|
|
||||||
if __name__ == '__main__':
    # With command line arguments: one run for the given range.
    # Without: the endless random stress test.
    entryPoint = main if len(sys.argv) > 1 else stress
    entryPoint()
|
@ -1,500 +0,0 @@ |
|||||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
# contributor license agreements. See the NOTICE file distributed with |
|
||||||
# this work for additional information regarding copyright ownership. |
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
# (the "License"); you may not use this file except in compliance with |
|
||||||
# the License. You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
# Note, this file is known to work with rev 120 of the moman |
|
||||||
# repository (http://bitbucket.org/jpbarrette/moman/overview) |
|
||||||
# |
|
||||||
# See also: http://sites.google.com/site/rrettesite/moman |
|
||||||
|
|
||||||
import math |
|
||||||
import os |
|
||||||
import sys |
|
||||||
#sys.path.insert(0, 'moman/finenight/python') |
|
||||||
sys.path.insert(0, '../../../../../../../../build/core/moman/finenight/python') |
|
||||||
try: |
|
||||||
from possibleStates import genTransitions |
|
||||||
except ImportError: |
|
||||||
from finenight.possibleStates import genTransitions |
|
||||||
|
|
||||||
# How main() renders the transition tables: 'array' or 'switch'.
MODE = 'array'
# In array mode, whether the arrays are bit-packed into longs (see pack()).
PACKED = True
# Number of bits per packed word.
WORD = 64
# log2(WORD), computed exactly with integer arithmetic; float log division
# followed by int() can truncate to the wrong value for some powers of two.
LOG2_WORD = WORD.bit_length() - 1
#MODE = 'switch'
|
||||||
|
|
||||||
class LineOutput:
    """Collects lines of generated source, indenting them by brace depth.

    Calling the instance with a line appends it to ``self.l``.  A line
    containing '}' lowers the indent by one level (two spaces) before it is
    written; a line containing '{' raises it afterwards.  Lines outside
    block comments have their own leading whitespace stripped; lines that
    are part of a ``/* ... */`` comment keep it.
    """

    def __init__(self, indent=''):
        self.l = []
        self._indent = self.startIndent = indent
        self.inComment = False

    def __call__(self, s, indent=0):
        """Append line ``s``; a non-zero ``indent`` adds that many extra levels for this line."""
        opensComment = '/*' in s
        closesComment = '*/' in s

        if '}' in s:
            self.outdent()

        if indent != 0:
            prefix = '  ' * (len(self._indent)//2+indent)
        else:
            prefix = self._indent

        # A line that only opens or only closes a comment is written
        # verbatim; one that does both (or neither) leaves the flag alone.
        if opensComment != closesComment:
            self.inComment = True

        self.l.append(prefix + (s if self.inComment else s.lstrip()))

        self.inComment = self.inComment and not closesComment

        if '{' in s:
            self.indent()

    def __str__(self):
        # every opened level must have been closed again
        assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \
               (len(self._indent), len(self.startIndent))
        return '\n'.join(self.l)

    def indent(self):
        """Raise the indent by one level."""
        self._indent += '  '

    def outdent(self):
        """Lower the indent by one level; it must be above the starting indent."""
        assert self._indent != self.startIndent
        self._indent = self._indent[:-2]
|
||||||
|
|
||||||
def charVarNumber(charVar):
    """
    Maps binary number (eg [1, 0, 1]) to its decimal value (5).

    The first item is the most significant digit; items are converted
    with int(), so digit strings work as well.
    """
    total = 0
    for digit in charVar:
        # shift what we have so far one binary place left, then add the digit
        total = total * 2 + int(digit)
    return total
|
||||||
|
|
||||||
def main():
    """Generate the Java parametric description class for a Levenshtein automaton.

    Command line: ``N <True/False>`` -- the degree, and whether
    transpositions count as primitive edits.  The transition tables come
    from genTransitions(); the resulting ``Lev<N>[T]ParametricDescription.java``
    is written to the current working directory.  Exits with status 1 when
    the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print('')
        print('Usage: python -u %s N <True/False>' % sys.argv[0])
        print('')
        print('NOTE: the resulting .java file is created in the current working dir!')
        print('')
        sys.exit(1)

    n = int(sys.argv[1])

    transpose = (sys.argv[2] == "True")

    tables = genTransitions(n, transpose)

    # string form of a state -> state number
    stateMap = {}

    # init null state
    stateMap['[]'] = -1

    # init start state
    stateMap['[(0, 0)]'] = 0

    w = LineOutput()

    w('package com.fr.third.org.apache.lucene.util.automaton;')
    w('')
    for headerLine in (
            '/*',
            ' * Licensed to the Apache Software Foundation (ASF) under one or more',
            ' * contributor license agreements. See the NOTICE file distributed with',
            ' * this work for additional information regarding copyright ownership.',
            ' * The ASF licenses this file to You under the Apache License, Version 2.0',
            ' * (the "License"); you may not use this file except in compliance with',
            ' * the License. You may obtain a copy of the License at',
            ' *',
            ' * http://www.apache.org/licenses/LICENSE-2.0',
            ' *',
            ' * Unless required by applicable law or agreed to in writing, software',
            ' * distributed under the License is distributed on an "AS IS" BASIS,',
            ' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.',
            ' * See the License for the specific language governing permissions and',
            ' * limitations under the License.',
            ' */'):
        w(headerLine)
    w('')
    w('// The following code was generated with the moman/finenight pkg')
    w('// This package is available under the MIT License, see NOTICE.txt')
    w('// for more details.')
    w('')
    w('import com.fr.third.org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
    w('')
    if transpose:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
        w(' with transpositions as primitive edits */')
        className = 'Lev%dTParametricDescription' % n
    else:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
        className = 'Lev%dParametricDescription' % n

    w('class %s extends ParametricDescription {' % className)

    w('')
    w('@Override')
    w('int transition(int absState, int position, int vector) {')

    w(' // null absState should never be passed in')
    w(' assert absState != -1;')

    w('')
    w(' // decode absState -> state, offset')
    w(' int state = absState/(w+1);')
    w(' int offset = absState%(w+1);')
    w(' assert offset >= 0;')
    w('')

    # one (toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors) per table
    machines = []

    for i, table in enumerate(tables):
        if i == 0:
            w('if (position == w) {')
        elif i == len(tables)-1:
            w('} else {')
        else:
            w('} else if (position == w-%d) {' % i)

        if i != 0 and MODE == 'switch':
            w('switch(vector) {')

        vectors = sorted(table.items())

        numCasesPerVector = None
        numVectors = len(vectors)

        # Created in every mode: they are appended to machines below
        # regardless of MODE (they simply stay empty in switch mode).
        toStateArray = []
        toOffsetIncrArray = []

        for charVar, states in vectors:

            # somehow it's a string:
            # NOTE(review): eval() of table keys -- safe only because the
            # tables are produced locally by genTransitions().
            charVar = eval(charVar)

            if i != 0 and MODE == 'switch':
                w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
                w.indent()

            transitions = list(states.items())

            byFromState = {}

            # first pass to assign states
            byAction = {}
            for fromS, (toS, offset) in transitions:
                state = str(fromS)

                toState = str(toS)
                if state not in stateMap:
                    stateMap[state] = len(stateMap)-1
                if toState not in stateMap:
                    stateMap[toState] = len(stateMap)-1

                # stored shifted by one; the generated code subtracts 1 again
                byFromState[stateMap[state]] = (1+stateMap[toState], offset)

                fromStateDesc = fromS[1:len(fromS)-1]
                toStateDesc = ', '.join([str(x) for x in toS])

                tup = (stateMap[toState], toStateDesc, offset)
                if tup not in byAction:
                    byAction[tup] = []
                byAction[tup].append((fromStateDesc, stateMap[state]))

            if numCasesPerVector is None:
                numCasesPerVector = len(transitions)
            else:
                # we require this to be uniform... empirically it seems to be!
                assert numCasesPerVector == len(transitions)

            if MODE == 'array':

                for fromState in range(numCasesPerVector):
                    toState, offsetIncr = byFromState[fromState]
                    toStateArray.append(toState)
                    toOffsetIncrArray.append(offsetIncr)

            else:

                # render switches
                w('switch(state) { // %s cases' % len(transitions))

                for (toState, toStateDesc, offset), sources in byAction.items():
                    # all source states sharing this action fall through to one body
                    for fromStateDesc, fromState in sources:
                        w('case %s: // %s' % (fromState, fromStateDesc))
                    w.indent()
                    w(' state = %s; // %s' % (toState, toStateDesc))
                    if offset > 0:
                        w(' offset += %s;' % offset)
                    w('break;')
                    w.outdent()

                w('}')
                if i != 0:
                    w('break;')
                    w.outdent()

        if MODE == 'array':
            # strangely state can come in wildly out of bounds....
            w(' if (state < %d) {' % numCasesPerVector)
            w(' final int loc = vector * %d + state;' % numCasesPerVector)
            if PACKED:
                w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
                w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
            else:
                w(' offset += offsetIncrs%d[loc];' % i)
                w(' state = toStates%d[loc]-1;' % i)
            w(' }')
        elif i != 0:
            w('}')

        machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))

    # ends the if/else chain over position
    w('}')

    w('')

    w(' if (state == -1) {')
    w(' // null state')
    w(' return -1;')
    w(' } else {')
    w(' // translate back to abs')
    w(' return state*(w+1)+offset;')
    w(' }')

    # ends transition method
    w('}')

    # (placeholder, replacement) pairs applied to the finished text
    subs = []
    if MODE == 'array':
        w.indent()
        for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
            w('')
            w.outdent()
            w('// %d vectors; %d states per vector; array length = %d' % \
              (numVectors, numCasesPerVector, numVectors*numCasesPerVector))
            w.indent()
            if PACKED:
                # pack in python; each word is rendered as a hex literal
                # with an L suffix (what hex(long(x)) gives on Python 2)
                packedWords, nbits = pack(toStateArray)
                subs.append(('NBITSSTATES%d' % i, str(nbits)))
                w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \
                  (i, nbits, renderList(['0x%xL' % x for x in packedWords])))

                packedWords, nbits = pack(toOffsetIncrsArray)
                subs.append(('NBITSOFFSET%d' % i, str(nbits)))
                w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \
                  (i, nbits, renderList(['0x%xL' % x for x in packedWords])))
            else:
                w(' private final static int[] toStates%d = new int[] %s;' % \
                  (i, renderList([str(x) for x in toStateArray])))
                # the offset increments, not a second copy of the to-states
                w(' private final static int[] offsetIncrs%d = new int[] %s;' % \
                  (i, renderList([str(x) for x in toOffsetIncrsArray])))
        w.outdent()

    stateMap2 = dict([[v, k] for k, v in stateMap.items()])
    w('')
    w('// state map')
    minErrors = []
    for i in range(len(stateMap2)-1):
        w('// %s -> %s' % (i, stateMap2[i]))
        # we replace t-notation as it's not relevant here
        st = stateMap2[i].replace('t', '')

        positions = eval(st)
        minErrors.append(min([-pos+err for pos, err in positions]))
    w('')

    w('')
    w(' public %s(int w) {' % className)
    w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
    w(' }')

    # Generation code that was disabled with `if 0:` / `if False:` (size,
    # getPosition, isAccept, MASKS and unpack in Java) has been dropped.

    # class
    w('}')
    w('')

    fileOut = '%s.java' % className

    text = str(w)
    for sub, repl in subs:
        text = text.replace(sub, repl)

    # binary mode keeps the generated line endings as-is; encode only when
    # the text is not already a byte string
    data = text if isinstance(text, bytes) else text.encode('utf-8')
    with open(fileOut, 'wb') as out:
        out.write(data)

    print('Wrote %s [%d lines; %.1f KB]' % \
          (fileOut, len(w.l), os.path.getsize(fileOut)/1024.))
|
||||||
|
|
||||||
def renderList(l):
    """Render a list of strings as a brace-enclosed, comma-separated initializer.

    A line break is inserted after every fourth item.
    """
    pieces = [' ']
    for idx, item in enumerate(l):
        if idx > 0:
            # separator, plus a line break before items 4, 8, 12, ...
            pieces.append(',')
            if idx % 4 == 0:
                pieces.append('\n ')
        pieces.append(item)
    return '{\n%s\n }' % ''.join(pieces)
|
||||||
|
|
||||||
# MASKS[k] has the bottom k+1 bits set: 1, 3, 7, ... (63 entries).
MASKS = [(1 << (bits + 1)) - 1 for bits in range(63)]
|
||||||
|
|
||||||
# packs into longs; returns long[], numBits |
|
||||||
def pack(l):
    """Bit-pack the non-negative ints in ``l`` into WORD-bit words.

    Every value takes the same number of bits, derived from the largest
    value (at least 1).  Values are laid out from the low bits of each word
    upward; a value that does not fit in the current word is split across
    it and the next one.  Returns ``(packedWords, bitsPerValue)``.
    """
    maxV = max(l)
    bitsPerValue = max(1, int(math.ceil(math.log(maxV+1)/math.log(2.0))))

    free = WORD       # unused bits in the word being filled
    word = 0          # the word being filled

    packed = []
    for value in l:
        if word > 0:
            # sanity check: the pending word must fit in the bits consumed so far
            bitsUsed = math.ceil(math.log(word)/math.log(2.0))
            assert bitsUsed <= (WORD-free), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (free, WORD, free, WORD-free, bitsUsed)

        if free < bitsPerValue:
            # split

            # bottom `free` bits go in current word:
            word += (value & MASKS[free-1]) << (WORD-free)
            packed.append(word)

            # the remaining high bits start the next word
            word = value >> free
            free = WORD - (bitsPerValue-free)
            continue

        word += value << (WORD-free)
        free -= bitsPerValue
        if free == 0:
            packed.append(word)
            free = WORD
            word = 0

    if free < WORD:
        # flush the partially filled last word
        packed.append(word)

    # verify(l, packed, bitsPerValue)

    return packed, bitsPerValue
|
||||||
|
|
||||||
def verify(data, packedData, bitsPerValue):
    """Assert that every value in ``data`` can be read back from ``packedData`` with unpack()."""
    for index, expected in enumerate(data):
        assert expected == unpack(packedData, index, bitsPerValue)
|
||||||
|
|
||||||
def unpack(data, index, bitsPerValue):
    """Read the ``index``-th ``bitsPerValue``-bit value out of the packed words ``data``."""
    bitLoc = bitsPerValue * index
    wordIdx = int(bitLoc >> LOG2_WORD)
    firstBit = int(bitLoc & (WORD-1))

    if firstBit + bitsPerValue > WORD:
        # split: low part from this word, high part from the next one
        lowBits = WORD-firstBit
        low = (data[wordIdx] >> firstBit) & MASKS[lowBits-1]
        high = (data[1+wordIdx] & MASKS[bitsPerValue-lowBits-1]) << lowBits
        return int(low + high)

    # not split
    return int((data[wordIdx] >> firstBit) & MASKS[bitsPerValue-1])
|
||||||
|
|
||||||
if __name__ == '__main__':
    # __debug__ is False under -O, which also strips the assert statements
    # this script uses as checks, so refuse to run that way.
    if not __debug__:
        for message in ('', 'ERROR: please run without -O', ''):
            print(message)
        sys.exit(1)
    main()
|
@ -1,335 +0,0 @@ |
|||||||
#! /usr/bin/env python |
|
||||||
|
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
# contributor license agreements. See the NOTICE file distributed with |
|
||||||
# this work for additional information regarding copyright ownership. |
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
# (the "License"); you may not use this file except in compliance with |
|
||||||
# the License. You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
from fractions import gcd |
|
||||||
|
|
||||||
"""Code generation for bulk operations""" |
|
||||||
|
|
||||||
# Largest bits-per-value for which a dedicated BulkOperationPacked<N> class
# is generated; wider values use the generic BulkOperationPacked.
MAX_SPECIALIZED_BITS_PER_VALUE = 24

# Bits-per-value that get a BulkOperationPackedSingleBlock entry; every other
# slot of the generated table is null.
PACKED_64_SINGLE_BLOCK_BPV = list(range(1, 11)) + [12, 16, 21, 32]

# File that receives the generated BulkOperation base class.
OUTPUT_FILE = "BulkOperation.java"
|
||||||
HEADER = """// This file has been automatically generated, DO NOT EDIT |
|
||||||
|
|
||||||
package com.fr.third.org.apache.lucene.util.packed; |
|
||||||
|
|
||||||
/* |
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
* contributor license agreements. See the NOTICE file distributed with |
|
||||||
* this work for additional information regarding copyright ownership. |
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
* (the "License"); you may not use this file except in compliance with |
|
||||||
* the License. You may obtain a copy of the License at |
|
||||||
* |
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
* |
|
||||||
* Unless required by applicable law or agreed to in writing, software |
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
* See the License for the specific language governing permissions and |
|
||||||
* limitations under the License. |
|
||||||
*/ |
|
||||||
|
|
||||||
""" |
|
||||||
|
|
||||||
FOOTER=""" |
|
||||||
protected int writeLong(long block, byte[] blocks, int blocksOffset) { |
|
||||||
for (int j = 1; j <= 8; ++j) { |
|
||||||
blocks[blocksOffset++] = (byte) (block >>> (64 - (j << 3))); |
|
||||||
} |
|
||||||
return blocksOffset; |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* For every number of bits per value, there is a minimum number of |
|
||||||
* blocks (b) / values (v) you need to write in order to reach the next block |
|
||||||
* boundary: |
|
||||||
* - 16 bits per value -> b=1, v=4 |
|
||||||
* - 24 bits per value -> b=3, v=8 |
|
||||||
* - 50 bits per value -> b=25, v=32 |
|
||||||
* - 63 bits per value -> b=63, v=64 |
|
||||||
* - ... |
|
||||||
* |
|
||||||
* A bulk read consists in copying <code>iterations*v</code> values that are |
|
||||||
* contained in <code>iterations*b</code> blocks into a <code>long[]</code> |
|
||||||
* (higher values of <code>iterations</code> are likely to yield a better |
|
||||||
* throughput) => this requires n * (b + v) longs in memory. |
|
||||||
* |
|
||||||
* This method computes <code>iterations</code> as |
|
||||||
* <code>ramBudget / (8 * (b + v))</code> (since a long is 8 bytes). |
|
||||||
*/ |
|
||||||
public final int computeIterations(int valueCount, int ramBudget) { |
|
||||||
final int iterations = (ramBudget >>> 3) / (blockCount() + valueCount()); |
|
||||||
if (iterations == 0) { |
|
||||||
// at least 1 |
|
||||||
return 1; |
|
||||||
} else if ((iterations - 1) * blockCount() >= valueCount) { |
|
||||||
// don't allocate for more than the size of the reader |
|
||||||
return (int) Math.ceil((double) valueCount / valueCount()); |
|
||||||
} else { |
|
||||||
return iterations; |
|
||||||
} |
|
||||||
} |
|
||||||
} |
|
||||||
""" |
|
||||||
|
|
||||||
def is_power_of_two(n):
    """Return True when n is a positive power of two (1, 2, 4, 8, ...).

    A power of two has exactly one bit set, so clearing its lowest set bit
    with n & (n - 1) leaves zero.  Zero and negative numbers are rejected
    explicitly because the bit trick alone would accept 0.
    """
    return n > 0 and n & (n - 1) == 0
|
||||||
|
|
||||||
def casts(typ):
    """Return the (prefix, suffix) strings that wrap a Java expression in a cast.

    For "long" both parts are empty; for any other type name the pair is
    "(<typ>) (" and ")".
    """
    if typ == "long":
        return "", ""
    return "(%s) (" % typ, ")"
|
||||||
|
|
||||||
def hexNoLSuffix(n):
    """Return hex(n) with a trailing 'L' removed, if there is one.

    On 32 bit Python 2, hex() appends 'L' to values above (1 << 31) - 1.
    """
    text = hex(n)
    return text[:-1] if text.endswith('L') else text
|
||||||
|
|
||||||
def masks(bits):
    """Return the (prefix, suffix) strings that mask a Java expression to bits bits.

    A 64-bit value needs no masking, so both parts are empty in that case.
    Otherwise the suffix ANDs with a long literal whose low `bits` bits are set.
    """
    if bits != 64:
        low_bits = (1 << bits) - 1
        return "(", " & %sL)" % hexNoLSuffix(low_bits)
    return "", ""
|
||||||
|
|
||||||
def get_type(bits):
    """Return the Java primitive type name for a width of 8, 16, 32 or 64 bits.

    Raises AssertionError for any other width.  The error is raised
    explicitly rather than through an assert statement, so it still fires
    when Python runs with -O instead of silently returning None.
    """
    java_types = {8: "byte", 16: "short", 32: "int", 64: "long"}
    try:
        return java_types[bits]
    except (KeyError, TypeError):
        raise AssertionError("unsupported bit width: %r" % (bits,))
|
||||||
|
|
||||||
def block_value_count(bpv, bits=64):
    """Return (blocks, values) such that values * bpv == blocks * bits.

    Starts from bpv blocks of `bits` bits each, which hold exactly `bits`
    values, and halves both counts for as long as both are even.

    bpv  -- bits per packed value
    bits -- width of one block in bits (64 by default)

    Floor division keeps both counts integers under true division as well;
    every division here is exact, so the results are unchanged on Python 2.
    """
    blocks = bpv
    values = blocks * bits // bpv
    while blocks % 2 == 0 and values % 2 == 0:
        blocks //= 2
        values //= 2
    assert values * bpv == bits * blocks, "%d values, %d blocks, %d bits per value" % (values, blocks, bpv)
    return (blocks, values)
|
||||||
|
|
||||||
def packed64(bpv, f): |
|
||||||
blocks, values = block_value_count(bpv) |
|
||||||
mask = (1 << bpv) - 1 |
|
||||||
|
|
||||||
f.write("\n") |
|
||||||
f.write(" public BulkOperationPacked%d() {\n" %bpv) |
|
||||||
f.write(" super(%d);\n" %bpv) |
|
||||||
f.write(" assert blockCount() == %d;\n" %blocks) |
|
||||||
f.write(" assert valueCount() == %d;\n" %values) |
|
||||||
f.write(" }\n\n") |
|
||||||
|
|
||||||
if bpv == 64: |
|
||||||
f.write(""" @Override |
|
||||||
public void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) { |
|
||||||
System.arraycopy(blocks, blocksOffset, values, valuesOffset, valueCount() * iterations); |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) { |
|
||||||
throw new UnsupportedOperationException(); |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) { |
|
||||||
throw new UnsupportedOperationException(); |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) { |
|
||||||
LongBuffer.wrap(values, valuesOffset, iterations * valueCount()).put(ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer()); |
|
||||||
} |
|
||||||
""") |
|
||||||
else: |
|
||||||
p64_decode(bpv, f, 32) |
|
||||||
p64_decode(bpv, f, 64) |
|
||||||
|
|
||||||
def p64_decode(bpv, f, bits):
    """Write two Java decode(...) overrides for bpv-bit packed values to f.

    The first override reads its blocks from a long[], the second from a
    byte[]; both store into an array of the Java type named by
    get_type(bits).  When bits < bpv the emitted method bodies only throw
    UnsupportedOperationException.

    bpv  -- bits per packed value
    f    -- object with a write() method that receives the Java text
    bits -- width in bits of the destination element type
    """
    _, values = block_value_count(bpv)
    typ = get_type(bits)
    cast_start, cast_end = casts(typ)
    mask = (1 << bpv) - 1

    # ---- decode from long[] blocks ----
    f.write(" @Override\n")
    f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" % typ)
    if bits < bpv:
        f.write(" throw new UnsupportedOperationException();\n")
    else:
        f.write(" for (int i = 0; i < iterations; ++i) {\n")

        if is_power_of_two(bpv):
            # Values never straddle a block: emit a Java loop over the shifts.
            f.write(" final long block = blocks[blocksOffset++];\n")
            f.write(" for (int shift = %d; shift >= 0; shift -= %d) {\n" % (64 - bpv, bpv))
            f.write(" values[valuesOffset++] = %s(block >>> shift) & %d%s;\n" % (cast_start, mask, cast_end))
            f.write(" }\n")
        else:
            # Unrolled: one Java statement per value of the iteration.
            for i in range(values):
                block_offset = i * bpv // 64
                bit_offset = (i * bpv) % 64
                if bit_offset == 0:
                    # start of block
                    f.write(" final long block%d = blocks[blocksOffset++];\n" % block_offset)
                    f.write(" values[valuesOffset++] = %sblock%d >>> %d%s;\n" % (cast_start, block_offset, 64 - bpv, cast_end))
                elif bit_offset + bpv == 64:
                    # end of block
                    f.write(" values[valuesOffset++] = %sblock%d & %dL%s;\n" % (cast_start, block_offset, mask, cast_end))
                elif bit_offset + bpv < 64:
                    # middle of block
                    f.write(" values[valuesOffset++] = %s(block%d >>> %d) & %dL%s;\n" % (cast_start, block_offset, 64 - bit_offset - bpv, mask, cast_end))
                else:
                    # value spans across 2 blocks
                    mask1 = (1 << (64 - bit_offset)) - 1
                    shift1 = bit_offset + bpv - 64
                    shift2 = 64 - shift1
                    f.write(" final long block%d = blocks[blocksOffset++];\n" % (block_offset + 1))
                    f.write(" values[valuesOffset++] = %s((block%d & %dL) << %d) | (block%d >>> %d)%s;\n" % (cast_start, block_offset, mask1, shift1, block_offset + 1, shift2, cast_end))
        f.write(" }\n")
    f.write(" }\n\n")

    # ---- decode from byte[] blocks ----
    _, byte_values = block_value_count(bpv, 8)

    f.write(" @Override\n")
    f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" % typ)
    if bits < bpv:
        f.write(" throw new UnsupportedOperationException();\n")
    else:
        if is_power_of_two(bpv) and bpv < 8:
            # Several whole values per byte.
            f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n")
            f.write(" final byte block = blocks[blocksOffset++];\n")
            for shift in range(8 - bpv, 0, -bpv):
                f.write(" values[valuesOffset++] = (block >>> %d) & %d;\n" % (shift, mask))
            f.write(" values[valuesOffset++] = block & %d;\n" % mask)
            f.write(" }\n")
        elif bpv == 8:
            # Exactly one value per byte.
            f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n")
            f.write(" values[valuesOffset++] = blocks[blocksOffset++] & 0xFF;\n")
            f.write(" }\n")
        elif is_power_of_two(bpv) and bpv > 8:
            # A whole number of bytes per value, most significant byte first.
            f.write(" for (int j = 0; j < %d * iterations; ++j) {\n" % (64 // bpv))
            m = "0xFF" if bits <= 32 else "0xFFL"
            f.write(" values[valuesOffset++] =")
            for i in range(bpv // 8 - 1):
                # Each successive byte sits 8 bits lower than the previous
                # one.  A constant shift of bpv - 8 is only right for the
                # first byte, i.e. it only worked for 16-bit values.
                f.write(" ((blocks[blocksOffset++] & %s) << %d) |" % (m, bpv - 8 * (i + 1)))
            f.write(" (blocks[blocksOffset++] & %s);\n" % m)
            f.write(" }\n")
        else:
            # Values straddle byte boundaries: unroll one statement per value.
            f.write(" for (int i = 0; i < 8 * iterations; ++i) {\n")
            for i in range(byte_values):
                byte_start = i * bpv // 8
                bit_start = (i * bpv) % 8
                byte_end = ((i + 1) * bpv - 1) // 8
                bit_end = ((i + 1) * bpv - 1) % 8

                def shift(b):
                    # Left shift that places byte b above the bytes after it.
                    return 8 * (byte_end - b - 1) + 1 + bit_end

                # Declare every byte this value touches that has not been
                # read yet; a byte shared with the previous value already is.
                if bit_start == 0:
                    f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" % (typ, byte_start))
                for b in range(byte_start + 1, byte_end + 1):
                    f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" % (typ, b))

                f.write(" values[valuesOffset++] =")
                if byte_start == byte_end:
                    if bit_start == 0:
                        if bit_end == 7:
                            f.write(" byte%d" % byte_start)
                        else:
                            f.write(" byte%d >>> %d" % (byte_start, 7 - bit_end))
                    else:
                        if bit_end == 7:
                            f.write(" byte%d & %d" % (byte_start, 2 ** (8 - bit_start) - 1))
                        else:
                            f.write(" (byte%d >>> %d) & %d" % (byte_start, 7 - bit_end, 2 ** (bit_end - bit_start + 1) - 1))
                else:
                    if bit_start == 0:
                        f.write(" (byte%d << %d)" % (byte_start, shift(byte_start)))
                    else:
                        f.write(" ((byte%d & %d) << %d)" % (byte_start, 2 ** (8 - bit_start) - 1, shift(byte_start)))
                    for b in range(byte_start + 1, byte_end):
                        f.write(" | (byte%d << %d)" % (b, shift(b)))
                    if bit_end == 7:
                        f.write(" | byte%d" % byte_end)
                    else:
                        f.write(" | (byte%d >>> %d)" % (byte_end, 7 - bit_end))
                f.write(";\n")
            f.write(" }\n")
    f.write(" }\n\n")
|
||||||
|
|
||||||
if __name__ == '__main__': |
|
||||||
f = open(OUTPUT_FILE, 'w') |
|
||||||
f.write(HEADER) |
|
||||||
f.write('\n') |
|
||||||
f.write('''/** |
|
||||||
* Efficient sequential read/write of packed integers. |
|
||||||
*/\n''') |
|
||||||
|
|
||||||
f.write('abstract class BulkOperation implements PackedInts.Decoder, PackedInts.Encoder {\n') |
|
||||||
f.write(' private static final BulkOperation[] packedBulkOps = new BulkOperation[] {\n') |
|
||||||
|
|
||||||
for bpv in xrange(1, 65): |
|
||||||
if bpv > MAX_SPECIALIZED_BITS_PER_VALUE: |
|
||||||
f.write(' new BulkOperationPacked(%d),\n' % bpv) |
|
||||||
continue |
|
||||||
f2 = open('BulkOperationPacked%d.java' % bpv, 'w') |
|
||||||
f2.write(HEADER) |
|
||||||
if bpv == 64: |
|
||||||
f2.write('import java.nio.LongBuffer;\n') |
|
||||||
f2.write('import java.nio.ByteBuffer;\n') |
|
||||||
f2.write('\n') |
|
||||||
f2.write('''/** |
|
||||||
* Efficient sequential read/write of packed integers. |
|
||||||
*/\n''') |
|
||||||
f2.write('final class BulkOperationPacked%d extends BulkOperationPacked {\n' % bpv) |
|
||||||
packed64(bpv, f2) |
|
||||||
f2.write('}\n') |
|
||||||
f2.close() |
|
||||||
f.write(' new BulkOperationPacked%d(),\n' % bpv) |
|
||||||
|
|
||||||
f.write(' };\n') |
|
||||||
f.write('\n') |
|
||||||
|
|
||||||
f.write(' // NOTE: this is sparse (some entries are null):\n') |
|
||||||
f.write(' private static final BulkOperation[] packedSingleBlockBulkOps = new BulkOperation[] {\n') |
|
||||||
for bpv in xrange(1, max(PACKED_64_SINGLE_BLOCK_BPV)+1): |
|
||||||
if bpv in PACKED_64_SINGLE_BLOCK_BPV: |
|
||||||
f.write(' new BulkOperationPackedSingleBlock(%d),\n' % bpv) |
|
||||||
else: |
|
||||||
f.write(' null,\n') |
|
||||||
f.write(' };\n') |
|
||||||
f.write('\n') |
|
||||||
|
|
||||||
f.write("\n") |
|
||||||
f.write(" public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {\n") |
|
||||||
f.write(" switch (format) {\n") |
|
||||||
|
|
||||||
f.write(" case PACKED:\n") |
|
||||||
f.write(" assert packedBulkOps[bitsPerValue - 1] != null;\n") |
|
||||||
f.write(" return packedBulkOps[bitsPerValue - 1];\n") |
|
||||||
f.write(" case PACKED_SINGLE_BLOCK:\n") |
|
||||||
f.write(" assert packedSingleBlockBulkOps[bitsPerValue - 1] != null;\n") |
|
||||||
f.write(" return packedSingleBlockBulkOps[bitsPerValue - 1];\n") |
|
||||||
f.write(" default:\n") |
|
||||||
f.write(" throw new AssertionError();\n") |
|
||||||
f.write(" }\n") |
|
||||||
f.write(" }\n") |
|
||||||
f.write(FOOTER) |
|
||||||
f.close() |
|
@ -1,175 +0,0 @@ |
|||||||
#! /usr/bin/env python |
|
||||||
|
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
# contributor license agreements. See the NOTICE file distributed with |
|
||||||
# this work for additional information regarding copyright ownership. |
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
# (the "License"); you may not use this file except in compliance with |
|
||||||
# the License. You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
HEADER="""// This file has been automatically generated, DO NOT EDIT |
|
||||||
|
|
||||||
package com.fr.third.org.apache.lucene.util.packed; |
|
||||||
|
|
||||||
/* |
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
* contributor license agreements. See the NOTICE file distributed with |
|
||||||
* this work for additional information regarding copyright ownership. |
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
* (the "License"); you may not use this file except in compliance with |
|
||||||
* the License. You may obtain a copy of the License at |
|
||||||
* |
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
* |
|
||||||
* Unless required by applicable law or agreed to in writing, software |
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
* See the License for the specific language governing permissions and |
|
||||||
* limitations under the License. |
|
||||||
*/ |
|
||||||
|
|
||||||
import com.fr.third.org.apache.lucene.store.DataInput; |
|
||||||
import com.fr.third.org.apache.lucene.util.RamUsageEstimator; |
|
||||||
|
|
||||||
import java.io.IOException; |
|
||||||
import java.util.Arrays; |
|
||||||
|
|
||||||
""" |
|
||||||
|
|
||||||
# Java primitive type of the backing array, keyed by bits per value.
TYPES = {
    8: "byte",
    16: "short",
    32: "int",
    64: "long",
}

# Mask expression appended where a stored element is read back as a long.
MASKS = {
    8: " & 0xFFL",
    16: " & 0xFFFFL",
    32: " & 0xFFFFFFFFL",
    64: "",
}

# Cast prefix used where a long is stored into the backing array.
CASTS = {
    8: "(byte) ",
    16: "(short) ",
    32: "(int) ",
    64: "",
}
|
||||||
|
|
||||||
if __name__ == '__main__': |
|
||||||
for bpv in TYPES.keys(): |
|
||||||
type |
|
||||||
f = open("Direct%d.java" %bpv, 'w') |
|
||||||
f.write(HEADER) |
|
||||||
f.write("""/** |
|
||||||
* Direct wrapping of %d-bits values to a backing array. |
|
||||||
* @lucene.internal |
|
||||||
*/\n""" %bpv) |
|
||||||
f.write("final class Direct%d extends PackedInts.MutableImpl {\n" %bpv) |
|
||||||
f.write(" final %s[] values;\n\n" %TYPES[bpv]) |
|
||||||
|
|
||||||
f.write(" Direct%d(int valueCount) {\n" %bpv) |
|
||||||
f.write(" super(valueCount, %d);\n" %bpv) |
|
||||||
f.write(" values = new %s[valueCount];\n" %TYPES[bpv]) |
|
||||||
f.write(" }\n\n") |
|
||||||
|
|
||||||
f.write(" Direct%d(DataInput in, int valueCount) throws IOException {\n" %bpv) |
|
||||||
f.write(" this(valueCount);\n") |
|
||||||
f.write(" for (int i = 0; i < valueCount; ++i) {\n") |
|
||||||
f.write(" values[i] = in.read%s();\n" %TYPES[bpv].title()) |
|
||||||
f.write(" }\n") |
|
||||||
if bpv != 64: |
|
||||||
f.write(" final int mod = valueCount %% %d;\n" %(64 / bpv)) |
|
||||||
f.write(" if (mod != 0) {\n") |
|
||||||
f.write(" for (int i = mod; i < %d; ++i) {\n" %(64 / bpv)) |
|
||||||
f.write(" in.read%s();\n" %TYPES[bpv].title()) |
|
||||||
f.write(" }\n") |
|
||||||
f.write(" }\n") |
|
||||||
f.write(" }\n") |
|
||||||
|
|
||||||
f.write(""" |
|
||||||
@Override |
|
||||||
public long get(final int index) { |
|
||||||
return values[index]%s; |
|
||||||
} |
|
||||||
|
|
||||||
public void set(final int index, final long value) { |
|
||||||
values[index] = %s(value); |
|
||||||
} |
|
||||||
|
|
||||||
public long ramBytesUsed() { |
|
||||||
return RamUsageEstimator.sizeOf(values); |
|
||||||
} |
|
||||||
|
|
||||||
public void clear() { |
|
||||||
Arrays.fill(values, %s0L); |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public Object getArray() { |
|
||||||
return values; |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public boolean hasArray() { |
|
||||||
return true; |
|
||||||
} |
|
||||||
""" %(MASKS[bpv], CASTS[bpv], CASTS[bpv])) |
|
||||||
|
|
||||||
if bpv == 64: |
|
||||||
f.write(""" |
|
||||||
@Override |
|
||||||
public int get(int index, long[] arr, int off, int len) { |
|
||||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
|
||||||
assert index >= 0 && index < valueCount; |
|
||||||
assert off + len <= arr.length; |
|
||||||
|
|
||||||
final int gets = Math.min(valueCount - index, len); |
|
||||||
System.arraycopy(values, index, arr, off, gets); |
|
||||||
return gets; |
|
||||||
} |
|
||||||
|
|
||||||
public int set(int index, long[] arr, int off, int len) { |
|
||||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
|
||||||
assert index >= 0 && index < valueCount; |
|
||||||
assert off + len <= arr.length; |
|
||||||
|
|
||||||
final int sets = Math.min(valueCount - index, len); |
|
||||||
System.arraycopy(arr, off, values, index, sets); |
|
||||||
return sets; |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public void fill(int fromIndex, int toIndex, long val) { |
|
||||||
Arrays.fill(values, fromIndex, toIndex, val); |
|
||||||
} |
|
||||||
""") |
|
||||||
else: |
|
||||||
f.write(""" |
|
||||||
@Override |
|
||||||
public int get(int index, long[] arr, int off, int len) { |
|
||||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
|
||||||
assert index >= 0 && index < valueCount; |
|
||||||
assert off + len <= arr.length; |
|
||||||
|
|
||||||
final int gets = Math.min(valueCount - index, len); |
|
||||||
for (int i = index, o = off, end = index + gets; i < end; ++i, ++o) { |
|
||||||
arr[o] = values[i]%s; |
|
||||||
} |
|
||||||
return gets; |
|
||||||
} |
|
||||||
|
|
||||||
public int set(int index, long[] arr, int off, int len) { |
|
||||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
|
||||||
assert index >= 0 && index < valueCount; |
|
||||||
assert off + len <= arr.length; |
|
||||||
|
|
||||||
final int sets = Math.min(valueCount - index, len); |
|
||||||
for (int i = index, o = off, end = index + sets; i < end; ++i, ++o) { |
|
||||||
values[i] = %sarr[o]; |
|
||||||
} |
|
||||||
return sets; |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public void fill(int fromIndex, int toIndex, long val) { |
|
||||||
assert val == (val%s); |
|
||||||
Arrays.fill(values, fromIndex, toIndex, %sval); |
|
||||||
} |
|
||||||
""" %(MASKS[bpv], CASTS[bpv], MASKS[bpv], CASTS[bpv])) |
|
||||||
|
|
||||||
f.write("}\n") |
|
||||||
|
|
||||||
f.close() |
|
@ -1,291 +0,0 @@ |
|||||||
#! /usr/bin/env python |
|
||||||
|
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
# contributor license agreements. See the NOTICE file distributed with |
|
||||||
# this work for additional information regarding copyright ownership. |
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
# (the "License"); you may not use this file except in compliance with |
|
||||||
# the License. You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
SUPPORTED_BITS_PER_VALUE = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32] |
|
||||||
|
|
||||||
HEADER="""// This file has been automatically generated, DO NOT EDIT |
|
||||||
|
|
||||||
package com.fr.third.org.apache.lucene.util.packed; |
|
||||||
|
|
||||||
/* |
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
* contributor license agreements. See the NOTICE file distributed with this |
|
||||||
* work for additional information regarding copyright ownership. The ASF |
|
||||||
* licenses this file to You under the Apache License, Version 2.0 (the |
|
||||||
* "License"); you may not use this file except in compliance with the License. |
|
||||||
* You may obtain a copy of the License at |
|
||||||
* |
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
* |
|
||||||
* Unless required by applicable law or agreed to in writing, software |
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
||||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
||||||
* License for the specific language governing permissions and limitations under |
|
||||||
* the License. |
|
||||||
*/ |
|
||||||
|
|
||||||
import java.io.IOException; |
|
||||||
import java.util.Arrays; |
|
||||||
|
|
||||||
import com.fr.third.org.apache.lucene.store.DataInput; |
|
||||||
import com.fr.third.org.apache.lucene.util.RamUsageEstimator; |
|
||||||
|
|
||||||
/** |
|
||||||
* This class is similar to {@link Packed64} except that it trades space for |
|
||||||
* speed by ensuring that a single block needs to be read/written in order to |
|
||||||
* read/write a value. |
|
||||||
*/ |
|
||||||
abstract class Packed64SingleBlock extends PackedInts.MutableImpl { |
|
||||||
|
|
||||||
public static final int MAX_SUPPORTED_BITS_PER_VALUE = %d; |
|
||||||
private static final int[] SUPPORTED_BITS_PER_VALUE = new int[] {%s}; |
|
||||||
|
|
||||||
public static boolean isSupported(int bitsPerValue) { |
|
||||||
return Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) >= 0; |
|
||||||
} |
|
||||||
|
|
||||||
private static int requiredCapacity(int valueCount, int valuesPerBlock) { |
|
||||||
return valueCount / valuesPerBlock |
|
||||||
+ (valueCount %% valuesPerBlock == 0 ? 0 : 1); |
|
||||||
} |
|
||||||
|
|
||||||
final long[] blocks; |
|
||||||
|
|
||||||
Packed64SingleBlock(int valueCount, int bitsPerValue) { |
|
||||||
super(valueCount, bitsPerValue); |
|
||||||
assert isSupported(bitsPerValue); |
|
||||||
final int valuesPerBlock = 64 / bitsPerValue; |
|
||||||
blocks = new long[requiredCapacity(valueCount, valuesPerBlock)]; |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public void clear() { |
|
||||||
Arrays.fill(blocks, 0L); |
|
||||||
} |
|
||||||
|
|
||||||
public long ramBytesUsed() { |
|
||||||
return RamUsageEstimator.sizeOf(blocks); |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public int get(int index, long[] arr, int off, int len) { |
|
||||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
|
||||||
assert index >= 0 && index < valueCount; |
|
||||||
len = Math.min(len, valueCount - index); |
|
||||||
assert off + len <= arr.length; |
|
||||||
|
|
||||||
final int originalIndex = index; |
|
||||||
|
|
||||||
// go to the next block boundary |
|
||||||
final int valuesPerBlock = 64 / bitsPerValue; |
|
||||||
final int offsetInBlock = index %% valuesPerBlock; |
|
||||||
if (offsetInBlock != 0) { |
|
||||||
for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) { |
|
||||||
arr[off++] = get(index++); |
|
||||||
--len; |
|
||||||
} |
|
||||||
if (len == 0) { |
|
||||||
return index - originalIndex; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// bulk get |
|
||||||
assert index %% valuesPerBlock == 0; |
|
||||||
final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); |
|
||||||
assert decoder.blockCount() == 1; |
|
||||||
assert decoder.valueCount() == valuesPerBlock; |
|
||||||
final int blockIndex = index / valuesPerBlock; |
|
||||||
final int nblocks = (index + len) / valuesPerBlock - blockIndex; |
|
||||||
decoder.decode(blocks, blockIndex, arr, off, nblocks); |
|
||||||
final int diff = nblocks * valuesPerBlock; |
|
||||||
index += diff; len -= diff; |
|
||||||
|
|
||||||
if (index > originalIndex) { |
|
||||||
// stay at the block boundary |
|
||||||
return index - originalIndex; |
|
||||||
} else { |
|
||||||
// no progress so far => already at a block boundary but no full block to |
|
||||||
// get |
|
||||||
assert index == originalIndex; |
|
||||||
return super.get(index, arr, off, len); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public int set(int index, long[] arr, int off, int len) { |
|
||||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
|
||||||
assert index >= 0 && index < valueCount; |
|
||||||
len = Math.min(len, valueCount - index); |
|
||||||
assert off + len <= arr.length; |
|
||||||
|
|
||||||
final int originalIndex = index; |
|
||||||
|
|
||||||
// go to the next block boundary |
|
||||||
final int valuesPerBlock = 64 / bitsPerValue; |
|
||||||
final int offsetInBlock = index %% valuesPerBlock; |
|
||||||
if (offsetInBlock != 0) { |
|
||||||
for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) { |
|
||||||
set(index++, arr[off++]); |
|
||||||
--len; |
|
||||||
} |
|
||||||
if (len == 0) { |
|
||||||
return index - originalIndex; |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
// bulk set |
|
||||||
assert index %% valuesPerBlock == 0; |
|
||||||
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); |
|
||||||
assert op.blockCount() == 1; |
|
||||||
assert op.valueCount() == valuesPerBlock; |
|
||||||
final int blockIndex = index / valuesPerBlock; |
|
||||||
final int nblocks = (index + len) / valuesPerBlock - blockIndex; |
|
||||||
op.encode(arr, off, blocks, blockIndex, nblocks); |
|
||||||
final int diff = nblocks * valuesPerBlock; |
|
||||||
index += diff; len -= diff; |
|
||||||
|
|
||||||
if (index > originalIndex) { |
|
||||||
// stay at the block boundary |
|
||||||
return index - originalIndex; |
|
||||||
} else { |
|
||||||
// no progress so far => already at a block boundary but no full block to |
|
||||||
// set |
|
||||||
assert index == originalIndex; |
|
||||||
return super.set(index, arr, off, len); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public void fill(int fromIndex, int toIndex, long val) { |
|
||||||
assert fromIndex >= 0; |
|
||||||
assert fromIndex <= toIndex; |
|
||||||
assert PackedInts.bitsRequired(val) <= bitsPerValue; |
|
||||||
|
|
||||||
final int valuesPerBlock = 64 / bitsPerValue; |
|
||||||
if (toIndex - fromIndex <= valuesPerBlock << 1) { |
|
||||||
// there needs to be at least one full block to set for the block |
|
||||||
// approach to be worth trying |
|
||||||
super.fill(fromIndex, toIndex, val); |
|
||||||
return; |
|
||||||
} |
|
||||||
|
|
||||||
// set values naively until the next block start |
|
||||||
int fromOffsetInBlock = fromIndex %% valuesPerBlock; |
|
||||||
if (fromOffsetInBlock != 0) { |
|
||||||
for (int i = fromOffsetInBlock; i < valuesPerBlock; ++i) { |
|
||||||
set(fromIndex++, val); |
|
||||||
} |
|
||||||
assert fromIndex %% valuesPerBlock == 0; |
|
||||||
} |
|
||||||
|
|
||||||
// bulk set of the inner blocks |
|
||||||
final int fromBlock = fromIndex / valuesPerBlock; |
|
||||||
final int toBlock = toIndex / valuesPerBlock; |
|
||||||
assert fromBlock * valuesPerBlock == fromIndex; |
|
||||||
|
|
||||||
long blockValue = 0L; |
|
||||||
for (int i = 0; i < valuesPerBlock; ++i) { |
|
||||||
blockValue = blockValue | (val << (i * bitsPerValue)); |
|
||||||
} |
|
||||||
Arrays.fill(blocks, fromBlock, toBlock, blockValue); |
|
||||||
|
|
||||||
// fill the gap |
|
||||||
for (int i = valuesPerBlock * toBlock; i < toIndex; ++i) { |
|
||||||
set(i, val); |
|
||||||
} |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
protected PackedInts.Format getFormat() { |
|
||||||
return PackedInts.Format.PACKED_SINGLE_BLOCK; |
|
||||||
} |
|
||||||
|
|
||||||
@Override |
|
||||||
public String toString() { |
|
||||||
return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue |
|
||||||
+ ", size=" + size() + ", elements.length=" + blocks.length + ")"; |
|
||||||
} |
|
||||||
|
|
||||||
public static Packed64SingleBlock create(DataInput in, |
|
||||||
int valueCount, int bitsPerValue) throws IOException { |
|
||||||
Packed64SingleBlock reader = create(valueCount, bitsPerValue); |
|
||||||
for (int i = 0; i < reader.blocks.length; ++i) { |
|
||||||
reader.blocks[i] = in.readLong(); |
|
||||||
} |
|
||||||
return reader; |
|
||||||
} |
|
||||||
|
|
||||||
""" %(SUPPORTED_BITS_PER_VALUE[-1], ", ".join(map(str, SUPPORTED_BITS_PER_VALUE))) |
|
||||||
|
|
||||||
FOOTER = "}"


def _exact_log2(bpv):
    """Return log2(bpv) when bpv is a power of two, otherwise None."""
    log_2 = 0
    while (1 << log_2) < bpv:
        log_2 += 1
    if (1 << log_2) != bpv:
        return None
    return log_2


def _write_factory(out, supported_bpvs):
    """Write the static ``create(valueCount, bitsPerValue)`` factory method.

    One ``case`` is emitted per entry of ``supported_bpvs``; any other value
    falls through to a ``default`` branch that throws.
    """
    out.write(" public static Packed64SingleBlock create(int valueCount, int bitsPerValue) {\n")
    out.write(" switch (bitsPerValue) {\n")
    for bpv in supported_bpvs:
        out.write(" case %d:\n" % bpv)
        out.write(" return new Packed64SingleBlock%d(valueCount);\n" % bpv)
    out.write(" default:\n")
    # The message must show the value the Java caller passed in.  Formatting
    # the Python loop variable here would bake the last supported width into
    # the generated source instead of the offending argument.
    out.write(" throw new IllegalArgumentException(\"Unsupported number of bits per value: \" + bitsPerValue);\n")
    out.write(" }\n")
    out.write(" }\n\n")


def _write_index_locals(out, bpv, log_2):
    """Write the Java locals ``o`` (block index), ``b`` (slot) and ``shift``."""
    if log_2 is not None:
        # Power-of-two width: the generated code uses shifts and masks.
        out.write(" final int o = index >>> %d;\n" % (6 - log_2))
        out.write(" final int b = index & %d;\n" % ((1 << (6 - log_2)) - 1))
        out.write(" final int shift = b << %d;\n" % log_2)
    else:
        # Floor division keeps the count an int on both Python 2 and 3.
        values_per_block = 64 // bpv
        out.write(" final int o = index / %d;\n" % values_per_block)
        out.write(" final int b = index %% %d;\n" % values_per_block)
        out.write(" final int shift = b * %d;\n" % bpv)


def _write_subclass(out, bpv):
    """Write the nested ``Packed64SingleBlock<bpv>`` class with get and set."""
    log_2 = _exact_log2(bpv)
    value_mask = (1 << bpv) - 1

    out.write(" static class Packed64SingleBlock%d extends Packed64SingleBlock {\n\n" % bpv)

    out.write(" Packed64SingleBlock%d(int valueCount) {\n" % bpv)
    out.write(" super(valueCount, %d);\n" % bpv)
    out.write(" }\n\n")

    out.write(" @Override\n")
    out.write(" public long get(int index) {\n")
    _write_index_locals(out, bpv, log_2)
    out.write(" return (blocks[o] >>> shift) & %dL;\n" % value_mask)
    out.write(" }\n\n")

    out.write(" @Override\n")
    out.write(" public void set(int index, long value) {\n")
    _write_index_locals(out, bpv, log_2)
    out.write(" blocks[o] = (blocks[o] & ~(%dL << shift)) | (value << shift);\n" % value_mask)
    out.write(" }\n\n")
    out.write(" }\n\n")


if __name__ == '__main__':
    # The context manager closes the output file even if a write fails.
    with open("Packed64SingleBlock.java", 'w') as f:
        f.write(HEADER)
        _write_factory(f, SUPPORTED_BITS_PER_VALUE)
        for bpv in SUPPORTED_BITS_PER_VALUE:
            _write_subclass(f, bpv)
        f.write(FOOTER)
|
@ -1,161 +0,0 @@ |
|||||||
#! /usr/bin/env python |
|
||||||
|
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
# contributor license agreements. See the NOTICE file distributed with |
|
||||||
# this work for additional information regarding copyright ownership. |
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
# (the "License"); you may not use this file except in compliance with |
|
||||||
# the License. You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
# Preamble written at the top of every generated Java file.
HEADER = """// This file has been automatically generated, DO NOT EDIT

package com.fr.third.org.apache.lucene.util.packed;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import com.fr.third.org.apache.lucene.store.DataInput;
import com.fr.third.org.apache.lucene.util.RamUsageEstimator;

import java.io.IOException;
import java.util.Arrays;

"""

# Per block width (in bits): Java element type, the mask applied when a block
# is widened to long, and the cast applied when a long is narrowed to a block.
TYPES = {8: "byte", 16: "short"}
MASKS = {8: " & 0xFFL", 16: " & 0xFFFFL", 32: " & 0xFFFFFFFFL", 64: ""}
CASTS = {8: "(byte) ", 16: "(short) ", 32: "(int) ", 64: ""}

# Accessor methods of the generated class.  The placeholders are positional;
# _write_class() supplies them in order of appearance.
_METHODS_TEMPLATE = """
@Override
public long get(int index) {
final int o = index * 3;
return (blocks[o]%s) << %d | (blocks[o+1]%s) << %d | (blocks[o+2]%s);
}

@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int gets = Math.min(valueCount - index, len);
for (int i = index * 3, end = (index + gets) * 3; i < end; i+=3) {
arr[off++] = (blocks[i]%s) << %d | (blocks[i+1]%s) << %d | (blocks[i+2]%s);
}
return gets;
}

@Override
public void set(int index, long value) {
final int o = index * 3;
blocks[o] = %s(value >>> %d);
blocks[o+1] = %s(value >>> %d);
blocks[o+2] = %svalue;
}

@Override
public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int sets = Math.min(valueCount - index, len);
for (int i = off, o = index * 3, end = off + sets; i < end; ++i) {
final long value = arr[i];
blocks[o++] = %s(value >>> %d);
blocks[o++] = %s(value >>> %d);
blocks[o++] = %svalue;
}
return sets;
}

@Override
public void fill(int fromIndex, int toIndex, long val) {
final %s block1 = %s(val >>> %d);
final %s block2 = %s(val >>> %d);
final %s block3 = %sval;
for (int i = fromIndex * 3, end = toIndex * 3; i < end; i += 3) {
blocks[i] = block1;
blocks[i+1] = block2;
blocks[i+2] = block3;
}
}

@Override
public void clear() {
Arrays.fill(blocks, %s0);
}

public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(blocks);
}

@Override
public String toString() {
return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue
+ ", size=" + size() + ", elements.length=" + blocks.length + ")";
}
}
"""


def _write_class(out, bpv):
    """Write the ``Packed<bpv>ThreeBlocks`` class (javadoc to closing brace).

    ``out`` is any object with a ``write`` method; ``bpv`` is the width in
    bits of one block and must be a key of TYPES, MASKS and CASTS.
    """
    java_type = TYPES[bpv]
    mask = MASKS[bpv]
    cast = CASTS[bpv]
    read_call = "read%s" % java_type.title()
    # Floor division keeps the count an int on both Python 2 and 3.
    blocks_per_long = 64 // bpv

    out.write("""/**
* Packs integers into 3 %ss (%d bits per value).
* @lucene.internal
*/\n""" % (java_type, bpv * 3))
    out.write("final class Packed%dThreeBlocks extends PackedInts.MutableImpl {\n" % bpv)
    out.write(" final %s[] blocks;\n\n" % java_type)

    out.write(" public static final int MAX_SIZE = Integer.MAX_VALUE / 3;\n\n")

    out.write(" Packed%dThreeBlocks(int valueCount) {\n" % bpv)
    out.write(" super(valueCount, %d);\n" % (bpv * 3))
    out.write(" if (valueCount > MAX_SIZE) {\n")
    out.write(" throw new ArrayIndexOutOfBoundsException(\"MAX_SIZE exceeded\");\n")
    out.write(" }\n")
    out.write(" blocks = new %s[valueCount * 3];\n" % java_type)
    out.write(" }\n\n")

    out.write(" Packed%dThreeBlocks(DataInput in, int valueCount) throws IOException {\n" % bpv)
    out.write(" this(valueCount);\n")
    out.write(" for (int i = 0; i < 3 * valueCount; ++i) {\n")
    out.write(" blocks[i] = in.%s();\n" % read_call)
    out.write(" }\n")
    # The generated constructor performs extra reads when the block count is
    # not a multiple of the number of blocks that fit in 64 bits.
    out.write(" final int mod = blocks.length %% %d;\n" % blocks_per_long)
    out.write(" if (mod != 0) {\n")
    out.write(" for (int i = mod; i < %d; ++i) {\n" % blocks_per_long)
    out.write(" in.%s();\n" % read_call)
    out.write(" }\n")
    out.write(" }\n")
    out.write(" }\n")

    # Positional arguments, grouped by the template method that uses them.
    widen_args = (mask, 2 * bpv, mask, bpv, mask)        # get / bulk get
    narrow_args = (cast, 2 * bpv, cast, bpv, cast)       # set / bulk set
    fill_args = (java_type, cast, 2 * bpv,
                 java_type, cast, bpv,
                 java_type, cast)
    clear_args = (cast,)
    out.write(_METHODS_TEMPLATE
              % (widen_args * 2 + narrow_args * 2 + fill_args + clear_args))


if __name__ == '__main__':
    for bpv in TYPES:
        # The context manager closes each output file even if a write fails.
        with open("Packed%dThreeBlocks.java" % bpv, 'w') as f:
            f.write(HEADER)
            _write_class(f, bpv)
|
Loading…
Reference in new issue