Yuan.Wang
1 year ago
7 changed files with 0 additions and 2367 deletions
@ -1,539 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import re |
||||
|
||||
# A simple python script to generate an HTML entity map and a regex alternation |
||||
# for inclusion in HTMLStripCharFilter.jflex. |
||||
|
||||
# Entity names whose all-upper-case spelling is accepted as well.  The order
# of this tuple is the order of the generated
# upperCaseVariantsAccepted.put(...) lines.
_UPPER_CASE_VARIANTS = ('quot', 'copy', 'gt', 'lt', 'reg', 'amp')

# Matches one DTD declaration per line, e.g.
#   <!ENTITY nbsp "&#160;" >      or      <!ENTITY lt "&#38;#60;" >
# Group 1 is the entity name, group 2 the decimal code point.
_ENTITY_DECLARATION = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')


def _parse_entity_codes(entity_text):
    """Map each declared entity name to the text placed in the Java literal."""
    codes = {}
    for line in entity_text.split('\n'):
        match = _ENTITY_DECLARATION.match(line)
        if not match:
            continue
        name = match.group(1)
        if name == 'quot':
            # A bare double quote would end the generated Java string literal.
            codes[name] = r'\"'
        elif name == 'nbsp':
            # nbsp is emitted as a plain space rather than as \u00A0.
            codes[name] = ' '
        else:
            codes[name] = r'\u%04X' % int(match.group(2))
    return codes


def _wrap_entries(entries, first_prefix, continuation_prefix, limit=80):
    """Pack entries onto lines, starting a new line before reaching limit."""
    lines = []
    current = first_prefix
    for entry in entries:
        if len(current) + len(entry) >= limit:
            lines.append(current)
            current = continuation_prefix
        current += entry
    lines.append(current)
    return lines


def main(entity_text=None, license_text=None, out=None):
    """Write the generated entity alternation and Java lookup table.

    The output consists of the license header, a ``CharacterEntities``
    regex alternation listing every entity name (plus the upper-case
    variants named in ``_UPPER_CASE_VARIANTS``), and a ``%{ ... %}`` block of
    Java code that fills ``upperCaseVariantsAccepted`` and ``entityValues``.

    Only single-argument ``print``-free output is used so the script runs
    unchanged on Python 2 and Python 3.

    Args:
        entity_text: DTD text to scan for ``<!ENTITY name "&#NNN;">``
            declarations.  Defaults to ``get_entity_text()``.
        license_text: Header written first.  Defaults to
            ``get_apache_license()``.
        out: Object with a ``write(str)`` method.  Defaults to ``sys.stdout``.
    """
    import sys  # local import: the surrounding file only imports ``re``

    if entity_text is None:
        entity_text = get_entity_text()
    if license_text is None:
        license_text = get_apache_license()
    if out is None:
        out = sys.stdout

    def emit(line):
        out.write(line + '\n')

    emit(license_text)

    codes = _parse_entity_codes(entity_text)
    keys = sorted(codes)

    # Regex alternation over all entity names, wrapped below 80 columns.
    alternation = []
    for key in keys:
        alternation.append(('"%s"' if not alternation else ' | "%s"') % key)
        if key in _UPPER_CASE_VARIANTS:
            alternation.append(' | "%s"' % key.upper())
    wrapped = _wrap_entries(alternation, 'CharacterEntities = ( ', ' ')
    wrapped[-1] += ' )'
    for line in wrapped:
        emit(line)

    # Java block: first the lower-case -> upper-case variant map ...
    emit('%{')
    emit(' private static final Map<String,String> upperCaseVariantsAccepted')
    emit(' = new HashMap<String,String>();')
    emit(' static {')
    for name in _UPPER_CASE_VARIANTS:
        emit(' upperCaseVariantsAccepted.put("%s", "%s");' % (name, name.upper()))
    emit(' }')

    # ... then the entity name -> character table, as a flat String array.
    emit(' private static final CharArrayMap<Character> entityValues')
    emit(' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys))
    emit(' static {')
    emit(' String[] entities = {')
    table = [' "%s", "%s",' % (key, codes[key]) for key in keys]
    wrapped = _wrap_entries(table, ' ', ' ')
    wrapped[-1] = wrapped[-1][:-1]  # drop the comma after the last element
    for line in wrapped:
        emit(line)
    emit(' };')
    emit(' for (int i = 0 ; i < entities.length ; i += 2) {')
    emit(' Character value = entities[i + 1].charAt(0);')
    emit(' entityValues.put(entities[i], value);')
    emit(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
    emit(' if (upperCaseVariant != null) {')
    emit(' entityValues.put(upperCaseVariant, value);')
    emit(' }')
    emit(' }')
    emit(" }")
    emit("%}")
||||
|
||||
def get_entity_text():
    """Return the W3C entity-set text that main() scans for declarations.

    main() splits the text into lines and picks out every line of the form
    ``<!ENTITY name "&#NNN;" >``; everything else (prose, SGML comments,
    parameter-entity examples) is ignored by its regex.

    The entity values must stay as numeric character references.  If they
    are decoded into literal characters the regex no longer matches them,
    and a decoded ``quot`` value would terminate this raw string early.
    ``lt`` and ``amp`` use the doubly escaped ``&#38;#NN;`` form that the
    regex's optional ``#38;`` group exists for.
    """
    # The text below identifies itself as section F.1 ("XHTML Character
    # Entities") of the W3C XHTML Modularization recommendation and names the
    # xhtml-lat1.ent, xhtml-special.ent and xhtml-symbol.ent DTD files.
    # NOTE(review): this comment previously cited
    # <http://www.w3.org/TR/REC-html40/sgml/entities.html> as the source.
    return r"""
F.1. XHTML Character Entities

XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.

<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent

Typical invocation:

<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"

Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->

<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->

F.1.2. XHTML Special Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.

<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent

Typical invocation:

<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"

Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.

Revisions:
2000-10-28: added ' and altered XML Predefined Entities for compatibility
-->

<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->

<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->

<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->

<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->

<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->

<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->

<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->

<!-- end of xhtml-special.ent -->

F.1.3. XHTML Mathematical, Greek, and Symbolic Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.

<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent

Typical invocation:

<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"

Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->

<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->

<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->

<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->

<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->

<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->

<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->

<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->

<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->

<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->

<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->

<!-- end of xhtml-symbol.ent -->
"""
||||
|
||||
def get_apache_license():
    """Return the Apache License 2.0 notice formatted as a Java block comment.

    The text ends with one blank line after the closing ``*/``; main()
    writes it as the first thing in the generated output.
    """
    return r"""/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

"""
||||
|
||||
# Script entry point.  The call is unguarded, so the generator also runs
# (and prints) if this file is imported as a module.
main()
@ -1,366 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import types |
||||
import os |
||||
import sys |
||||
import random |
||||
|
||||
# Largest code point handled by this script.
MAX_UNICODE = 0x10FFFF

# TODO
#   - could be more minimal
#     - eg when bracket lands on a utf8 boundary, like 3 - 2047 -- they can share the two * edges
#     - also 3 2048 or 3 65536 -- it should not have an * down the red path, but it does

# MASKS[0] is bottom 1-bit
# MASKS[1] is bottom 2-bits
# ...

# Inclusive code point ranges that encode to 1, 2, 3 and 4 UTF-8 bytes.
utf8Ranges = [
    (0x0, 0x7F),
    (0x80, 0x7FF),
    (0x800, 0xFFFF),
    (0x10000, 0x10FFFF),
]

# Edge label -> color used when the automaton is rendered as a DOT graph;
# labels missing from this table are drawn in black.
typeToColor = dict(startend='purple', start='blue', end='red')
||||
|
||||
class FSA:
    """Small byte-level automaton used to model ranges of UTF-8 sequences.

    ``states`` maps a node id to a list whose first element is the node's
    label and whose remaining elements are ``(label, lo, hi, dest)`` edges,
    where ``lo``..``hi`` is an inclusive byte range.  Callers assign the
    ``start`` (and ``end``) attributes after creating those nodes.
    """

    def __init__(self):
        # maps fromNode -> [nodeLabel, (edgeLabel, startUTF8, endUTF8, endNode), ...]
        self.states = {}
        self.nodeUpto = 0

    def run(self, bytes):
        """Feed byte values through the automaton starting at ``self.start``.

        Returns the node reached after the last byte, or -1 as soon as a
        byte has no matching edge.  Raises RuntimeError when a state has
        more than one edge matching the same byte.
        """
        state = self.start
        for b in bytes:
            targets = [dest for _, lo, hi, dest in self.states[state][1:]
                       if lo <= b <= hi]
            if len(targets) > 1:
                raise RuntimeError('state %s has ambiguous output for byte %s' % (state, b))
            if not targets:
                return -1
            state = targets[0]

        return state

    def addEdge(self, n1, n2, v1, v2, label):
        """
        Adds edge from n1-n2, utf8 byte range v1-v2.
        """
        assert n1 in self.states
        # isinstance works on both Python 2 and 3 (types.IntType is 2-only).
        assert isinstance(v1, int)
        assert isinstance(v2, int)
        self.states[n1].append((label, v1, v2, n2))

    def addNode(self, label=None):
        """Create a node with the given label and return its id."""
        node = self.nodeUpto
        self.states[node] = [label]
        self.nodeUpto += 1
        return node

    def toDOT(self, label):
        """Render the automaton as a Graphviz digraph named ``label``."""
        lines = []
        w = lines.append

        # Locate the specially labelled nodes for the header section.
        endNode = startNode = None
        for node, details in self.states.items():
            if details[0] == 'end':
                endNode = node
            elif details[0] == 'start':
                startNode = node

        w('digraph %s {' % label)
        w(' rankdir=LR;')
        w(' size="8,5";')
        w(' node [color=white label=""]; Ns;')

        w(' node [color=black];')
        w(' node [shape=doublecircle, label=""]; N%s [label="%s"];' % (endNode, endNode))
        w(' node [shape=circle];')

        w(' N%s [label="%s"];' % (startNode, startNode))
        w(' Ns -> N%s;' % startNode)
        for node, details in self.states.items():
            w(' N%s [label="%s"];' % (node, node))
            for kind, lo, hi, dest in details[1:]:
                color = typeToColor.get(kind, 'black')
                if kind == 'all*':
                    # special case -- matches any utf8 byte at this point
                    edgeLabel = '*'
                elif lo == hi:
                    edgeLabel = '%s' % binary(lo)
                else:
                    edgeLabel = '%s-%s' % (binary(lo), binary(hi))
                w(' N%s -> N%s [label="%s" color="%s"];' % (node, dest, edgeLabel, color))
        w('}')
        return '\n'.join(lines)

    def toPNG(self, label, pngOut):
        """Write the DOT rendering to tmp.dot and convert it to ``pngOut``.

        Raises RuntimeError when the ``dot`` command exits with a non-zero
        status.
        """
        # Close the file before invoking dot so the content is flushed.
        with open('tmp.dot', 'w') as dotFile:
            dotFile.write(self.toDOT(label))
        if os.system('dot -Tpng tmp.dot -o %s' % pngOut):
            raise RuntimeError('dot failed')
||||
|
||||
|
||||
# MASKS[k] has the bottom k+1 bits set (1, 3, 7, ...).
MASKS = [(2 << bit) - 1 for bit in range(32)]
||||
|
||||
def binary(x):
    """Format a non-negative integer as space-separated 8-bit groups.

    Each group is zero-padded to eight binary digits.  Groups are emitted
    starting with the least significant byte.  Zero yields a single
    all-zero group; a negative value yields an empty string.
    """
    if x == 0:
        return '00000000'

    groups = []
    while x > 0:
        # peel off the low byte and render it as 8 binary digits
        groups.append(format(x & 0xFF, '08b'))
        x >>= 8

    return ' '.join(groups)
||||
|
||||
def getUTF8Rest(code, numBytes):
    """Return the trailing UTF-8 continuation bytes for ``code``.

    The result is a tuple of ``numBytes`` pairs ``(byte, 6)``, most
    significant first; each byte is ``10xxxxxx`` carrying six bits of
    ``code``.
    """
    rest = []
    for shift in range(6 * (numBytes - 1), -1, -6):
        # 0x3F keeps the six payload bits, 0x80 adds the continuation marker
        rest.append((0x80 | ((code >> shift) & 0x3F), 6))
    return tuple(rest)
||||
|
||||
def toUTF8(code):
    """Encode a Unicode code point as a tuple of ``(byte, payloadBits)`` pairs.

    ``payloadBits`` is the number of code point bits carried by that byte:
    7 for a single-byte sequence, 5/4/3 for the lead byte of a 2/3/4 byte
    sequence, and 6 for every continuation byte.
    """
    # code = Unicode code point
    assert code >= 0
    assert code <= MAX_UNICODE

    if code < 0x80:
        # 0xxxxxxx
        return ((code, 7),)

    if code < 0x800:
        # 110yyyxx 10xxxxxx
        prefix, payloadBits, trailing = 0xC0, 5, 1
    elif code < 0x10000:
        # 1110yyyy 10yyyyxx 10xxxxxx
        prefix, payloadBits, trailing = 0xE0, 4, 2
    else:
        # 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
        prefix, payloadBits, trailing = 0xF0, 3, 3

    lead = prefix | (code >> (6 * trailing))
    return ((lead, payloadBits),) + getUTF8Rest(code, trailing)
||||
|
||||
def all(fsa, startNode, endNode, startCode, endCode, left):
    """Add a path accepting lead bytes startCode-endCode plus any tail.

    One edge labelled 'all' covers the lead byte range; every element of
    ``left`` then contributes one 'all*' edge that accepts any continuation
    byte (128-191).  Intermediate nodes are created as needed and the last
    edge lands on ``endNode``.
    """
    if len(left) == 0:
        fsa.addEdge(startNode, endNode, startCode, endCode, 'all')
        return

    node = fsa.addNode()
    fsa.addEdge(startNode, node, startCode, endCode, 'all')
    # every continuation byte except the last needs its own intermediate node
    for _ in range(len(left) - 1):
        nextNode = fsa.addNode()
        fsa.addEdge(node, nextNode, 128, 191, 'all*')
        node = nextNode
    fsa.addEdge(node, endNode, 128, 191, 'all*')
||||
|
||||
def start(fsa, startNode, endNode, utf8, doAll):
    """Add the edges accepting every sequence >= ``utf8`` of the same length.

    ``utf8`` is a tuple of ``(byte, payloadBits)`` pairs.  When ``doAll`` is
    true, lead bytes above the first byte (up to the largest byte with the
    same prefix) are accepted with any continuation bytes.
    """
    lead, payloadBits = utf8[0]
    # largest byte that shares this byte's fixed prefix bits
    maxLead = lead | MASKS[payloadBits-1]

    if len(utf8) == 1:
        fsa.addEdge(startNode, endNode, lead, maxLead, 'start')
        return

    n = fsa.addNode()
    fsa.addEdge(startNode, n, lead, lead, 'start')
    start(fsa, n, endNode, utf8[1:], True)
    if doAll and lead != maxLead:
        all(fsa, startNode, endNode, lead+1, maxLead, utf8[1:])
||||
|
||||
def end(fsa, startNode, endNode, utf8, doAll):
    """Add the edges accepting every sequence <= ``utf8`` of the same length.

    ``utf8`` is a tuple of ``(byte, payloadBits)`` pairs.  When ``doAll`` is
    true, lead bytes below the first byte (down to the smallest usable lead
    byte) are accepted with any continuation bytes.
    """
    lead, payloadBits = utf8[0]

    if len(utf8) == 1:
        fsa.addEdge(startNode, endNode, lead & ~MASKS[payloadBits-1], lead, 'end')
        return

    if payloadBits == 5:
        # special case -- avoid creating unused edges (utf8 doesn't accept certain byte sequences):
        minLead = 194
    else:
        minLead = lead & (~MASKS[payloadBits-1])

    if doAll and lead != minLead:
        all(fsa, startNode, endNode, minLead, lead-1, utf8[1:])

    n = fsa.addNode()
    fsa.addEdge(startNode, n, lead, lead, 'end')
    end(fsa, n, endNode, utf8[1:], True)
||||
|
||||
def build(fsa, startNode, endNode, startUTF8, endUTF8):
    """Add edges so ``fsa`` accepts every UTF-8 sequence in the given range.

    ``startUTF8`` and ``endUTF8`` are the encoded range bounds as tuples of
    ``(byte, payloadBits)`` pairs.  The range is split into a start part,
    an optional middle part and an end part.
    """
    firstStart = startUTF8[0][0]
    firstEnd = endUTF8[0][0]
    startLen = len(startUTF8)
    endLen = len(endUTF8)

    if firstStart == firstEnd:
        # Degen case: lead with the same byte:
        if startLen == 1 and endLen == 1:
            fsa.addEdge(startNode, endNode, firstStart, firstEnd, 'startend')
            return
        assert startLen != 1
        assert endLen != 1
        n = fsa.addNode()
        # single value edge
        fsa.addEdge(startNode, n, firstStart, firstStart, 'single')
        build(fsa, n, endNode, startUTF8[1:], endUTF8[1:])
        return

    if startLen == endLen:
        if startLen == 1:
            fsa.addEdge(startNode, endNode, firstStart, firstEnd, 'startend')
            return
        start(fsa, startNode, endNode, startUTF8, False)
        # lead bytes strictly between the two bounds accept any tail
        if firstEnd - firstStart > 1:
            all(fsa, startNode, endNode, firstStart+1, firstEnd-1, startUTF8[1:])
        end(fsa, startNode, endNode, endUTF8, False)
        return

    # Bounds have different encoded lengths.
    # start
    start(fsa, startNode, endNode, startUTF8, True)

    # possibly middle: whole ranges for every length strictly in between
    for byteCount in range(startLen + 1, endLen):
        s = toUTF8(utf8Ranges[byteCount-1][0])
        e = toUTF8(utf8Ranges[byteCount-1][1])
        all(fsa, startNode, endNode, s[0][0], e[0][0], s[1:])

    # end
    end(fsa, startNode, endNode, endUTF8, True)
||||
|
||||
def main():
    """Build, render and self-test the automaton for one command-line range.

    Expects ``startUTF32 endUTF32 [testCode]`` as arguments; prints the
    binary UTF-8 form of each, writes the graph to /tmp/outpy.png and runs
    the randomized acceptance test.
    """

    def bits(code):
        return ' '.join([binary(x[0]) for x in toUTF8(code)])

    if len(sys.argv) not in (3, 4):
        print('')
        print('Usage: python %s startUTF32 endUTF32 [testCode]' % sys.argv[0])
        print('')
        sys.exit(1)

    utf32Start = int(sys.argv[1])
    utf32End = int(sys.argv[2])

    if utf32Start > utf32End:
        print('ERROR: start must be <= end')
        sys.exit(1)

    fsa = FSA()
    fsa.start = fsa.addNode('start')
    fsa.end = fsa.addNode('end')

    print('s=%s' % bits(utf32Start))
    print('e=%s' % bits(utf32End))

    if len(sys.argv) == 4:
        testCode = int(sys.argv[3])
        hexBytes = ' '.join(['%2x' % x[0] for x in toUTF8(testCode)])
        print('t=%s [%s]' % (bits(testCode), hexBytes))

    build(fsa, fsa.start, fsa.end,
          toUTF8(utf32Start),
          toUTF8(utf32End))

    fsa.toPNG('test', '/tmp/outpy.png')
    print('Saved to /tmp/outpy.png...')

    test(fsa, utf32Start, utf32End, 100000)
||||
|
||||
def test(fsa, utf32Start, utf32End, count):
    """Randomly check that ``fsa`` accepts exactly [utf32Start, utf32End].

    Draws ``count`` code points inside the range (each must end in
    ``fsa.end``) and ``count`` code points outside it (each must be
    rejected with -1).  Prints a FAILED line and returns False on the first
    mismatch; returns True otherwise.
    """

    def encode(code):
        return [tup[0] for tup in toUTF8(code)]

    def describe(code):
        return ' '.join([binary(x[0]) for x in toUTF8(code)])

    # verify correct ints are accepted
    for _ in range(count):
        r = random.randint(utf32Start, utf32End)
        if fsa.run(encode(r)) != fsa.end:
            print('FAILED: valid %s (%s) is not accepted' % (r, describe(r)))
            return False

    # Number of code points in [0, MAX_UNICODE] that lie outside the range.
    # There are MAX_UNICODE + 1 code points in total; using MAX_UNICODE here
    # would under-count by one and make randint fail on an empty interval
    # when exactly one code point is invalid.
    invalidCount = (MAX_UNICODE + 1) - (utf32End - utf32Start + 1)
    if invalidCount > 0:
        # verify invalid ints are not accepted
        for _ in range(count):
            r = random.randint(0, invalidCount-1)
            if r >= utf32Start:
                # skip over the valid range
                r = utf32End + 1 + r - utf32Start
            if fsa.run(encode(r)) != -1:
                print('FAILED: invalid %s (%s) is accepted' % (r, describe(r)))
                return False

    return True
||||
|
||||
def stress():
    """Endlessly build and test automata for random code point ranges.

    Progress is printed every ten iterations; a failing range is reported
    and the loop continues.
    """
    print('Testing...')

    iteration = 0
    while True:
        if iteration % 10 == 0:
            print('%s...' % iteration)
        iteration += 1

        # two random bounds, ordered so that start <= end
        utf32Start, utf32End = sorted((random.randint(0, MAX_UNICODE),
                                       random.randint(0, MAX_UNICODE)))

        fsa = FSA()
        fsa.start = fsa.addNode('start')
        fsa.end = fsa.addNode('end')
        build(fsa, fsa.start, fsa.end,
              toUTF8(utf32Start),
              toUTF8(utf32End))

        if not test(fsa, utf32Start, utf32End, 10000):
            print('FAILED on utf32Start=%s utf32End=%s' % (utf32Start, utf32End))
||||
|
||||
if __name__ == '__main__':
    # Without arguments run the endless randomized stress test; with
    # arguments handle the single range given on the command line.
    if len(sys.argv) <= 1:
        stress()
    else:
        main()
@ -1,500 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
# Note, this file is known to work with rev 120 of the moman |
||||
# repository (http://bitbucket.org/jpbarrette/moman/overview) |
||||
# |
||||
# See also: http://sites.google.com/site/rrettesite/moman |
||||
|
||||
import math |
||||
import os |
||||
import sys |
||||
#sys.path.insert(0, 'moman/finenight/python') |
||||
sys.path.insert(0, '../../../../../../../../build/core/moman/finenight/python') |
||||
try: |
||||
from possibleStates import genTransitions |
||||
except ImportError: |
||||
from finenight.possibleStates import genTransitions |
||||
|
||||
# How transitions are rendered: 'array' (lookup tables) or 'switch'.
MODE = 'array'
# When true, the lookup tables are bit-packed into longs.
PACKED = True
# Number of bits in one packed word.
WORD = 64
# log2(WORD): shift that turns a bit position into a word index.
LOG2_WORD = WORD.bit_length() - 1
#MODE = 'switch'
||||
|
||||
class LineOutput:
    """Collects generated source lines and indents them by brace nesting.

    Calling the instance appends one line.  A line containing '}' first
    drops one indentation level and a line containing '{' adds one level
    for the lines that follow.  Lines outside block comments have their own
    leading whitespace stripped; lines inside a /* ... */ comment are kept
    verbatim after the current indentation.
    """

    # One indentation level.  indent() adds it and outdent() removes exactly
    # this many characters, so the two must always agree.
    INDENT_UNIT = '  '

    def __init__(self, indent=''):
        self.l = []
        self._indent = self.startIndent = indent
        self.inComment = False

    def __call__(self, s, indent=0):
        """Append line ``s``; ``indent`` adds extra levels for this line only."""
        if '}' in s:
            self.outdent()

        if indent != 0:
            # current nesting depth plus the requested extra levels
            levels = len(self._indent) // len(self.INDENT_UNIT) + indent
            indent0 = self.INDENT_UNIT * levels
        else:
            indent0 = self._indent

        # A comment opener without a closer, or a lone closer, marks this
        # line as comment text.
        if '/*' in s:
            if '*/' not in s:
                self.inComment = True
        elif '*/' in s:
            self.inComment = True

        if self.inComment:
            self.l.append(indent0 + s)
        else:
            self.l.append(indent0 + s.lstrip())

        # the comment ends on the line that carries the closer
        self.inComment = self.inComment and '*/' not in s

        if '{' in s:
            self.indent()

    def __str__(self):
        assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \
            (len(self._indent), len(self.startIndent))
        return '\n'.join(self.l)

    def indent(self):
        """Increase the indentation by one level."""
        self._indent += self.INDENT_UNIT

    def outdent(self):
        """Decrease the indentation by one level; asserts there is one to drop."""
        assert self._indent != self.startIndent
        self._indent = self._indent[:-len(self.INDENT_UNIT)]
||||
|
||||
def charVarNumber(charVar):
    """
    Maps binary number (eg [1, 0, 1]) to its decimal value (5).

    The first element is the most significant digit; an empty sequence
    maps to 0.
    """
    value = 0
    for digit in charVar:
        # shift what we have one binary place and add the next digit
        value = value * 2 + int(digit)
    return value
||||
|
||||
def main():
    """Generate the Lev<N>[T]ParametricDescription.java source file.

    Command line: ``N <True/False>`` where N is the edit distance and the
    second argument selects transpositions as primitive edits.  The
    transition tables come from ``genTransitions``; the resulting .java file
    is written to the current working directory.
    """

    def javaLong(x):
        # Java long literal; hex() only appends 'L' for Python 2 longs.
        return hex(x).rstrip('L') + 'L'

    if len(sys.argv) != 3:
        print('')
        print('Usage: python -u %s N <True/False>' % sys.argv[0])
        print('')
        print('NOTE: the resulting .java file is created in the current working dir!')
        print('')
        sys.exit(1)

    n = int(sys.argv[1])

    transpose = (sys.argv[2] == "True")

    tables = genTransitions(n, transpose)

    stateMap = {}

    # init null state
    stateMap['[]'] = -1

    # init start state
    stateMap['[(0, 0)]'] = 0

    w = LineOutput()

    for headerLine in (
            'package com.fr.third.org.apache.lucene.util.automaton;',
            '',
            '/*',
            ' * Licensed to the Apache Software Foundation (ASF) under one or more',
            ' * contributor license agreements. See the NOTICE file distributed with',
            ' * this work for additional information regarding copyright ownership.',
            ' * The ASF licenses this file to You under the Apache License, Version 2.0',
            ' * (the "License"); you may not use this file except in compliance with',
            ' * the License. You may obtain a copy of the License at',
            ' *',
            ' * http://www.apache.org/licenses/LICENSE-2.0',
            ' *',
            ' * Unless required by applicable law or agreed to in writing, software',
            ' * distributed under the License is distributed on an "AS IS" BASIS,',
            ' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.',
            ' * See the License for the specific language governing permissions and',
            ' * limitations under the License.',
            ' */',
            '',
            '// The following code was generated with the moman/finenight pkg',
            '// This package is available under the MIT License, see NOTICE.txt',
            '// for more details.',
            '',
            'import com.fr.third.org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;',
            ''):
        w(headerLine)

    if transpose:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
        w(' with transpositions as primitive edits */')
        className = 'Lev%dTParametricDescription' % n
    else:
        w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
        className = 'Lev%dParametricDescription' % n

    w('class %s extends ParametricDescription {' % className)

    w('')
    w('@Override')
    w('int transition(int absState, int position, int vector) {')

    w(' // null absState should never be passed in')
    w(' assert absState != -1;')

    w('')
    w(' // decode absState -> state, offset')
    w(' int state = absState/(w+1);')
    w(' int offset = absState%(w+1);')
    w(' assert offset >= 0;')
    w('')

    machines = []

    for i, table in enumerate(tables):
        if i == 0:
            w('if (position == w) {')
        elif i == len(tables)-1:
            w('} else {')
        else:
            w('} else if (position == w-%d) {' % i)

        if i != 0 and MODE == 'switch':
            w('switch(vector) {')

        vectors = sorted(table.items())

        numCasesPerVector = None
        numVectors = len(vectors)

        # Filled only in 'array' mode, but always defined so that the
        # machines entry below can be built in either mode.
        toStateArray = []
        toOffsetIncrArray = []

        for charVar, states in vectors:

            # somehow it's a string:
            # NOTE(review): eval is applied to genTransitions output; keep
            # that input trusted.
            charVar = eval(charVar)

            if i != 0 and MODE == 'switch':
                w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
                w.indent()

            cases = states.items()

            byFromState = {}

            # first pass to assign states
            byAction = {}
            for s, (toS, offset) in cases:
                state = str(s)

                toState = str(toS)
                if state not in stateMap:
                    stateMap[state] = len(stateMap)-1
                if toState not in stateMap:
                    stateMap[toState] = len(stateMap)-1

                byFromState[stateMap[state]] = (1+stateMap[toState], offset)

                fromStateDesc = s[1:len(s)-1]
                toStateDesc = ', '.join([str(x) for x in toS])

                tup = (stateMap[toState], toStateDesc, offset)
                if tup not in byAction:
                    byAction[tup] = []
                byAction[tup].append((fromStateDesc, stateMap[state]))

            if numCasesPerVector is None:
                numCasesPerVector = len(cases)
            else:
                # we require this to be uniform... empirically it seems to be!
                assert numCasesPerVector == len(cases)

            if MODE == 'array':

                for fromState in range(numCasesPerVector):
                    toState, offsetIncr = byFromState[fromState]
                    toStateArray.append(toState)
                    toOffsetIncrArray.append(offsetIncr)

            else:

                # render switches
                w('switch(state) { // %s cases' % len(cases))

                for (toState, toStateDesc, offset), lx in byAction.items():
                    for fromStateDesc, fromState in lx:
                        w('case %s: // %s' % (fromState, fromStateDesc))
                    w.indent()
                    w(' state = %s; // %s' % (toState, toStateDesc))
                    if offset > 0:
                        w(' offset += %s;' % offset)
                    w('break;')
                    w.outdent()

                w('}')
                if i != 0:
                    w('break;')
                    w.outdent()

        if MODE == 'array':
            # strangely state can come in wildly out of bounds....
            w(' if (state < %d) {' % numCasesPerVector)
            w(' final int loc = vector * %d + state;' % numCasesPerVector)
            if PACKED:
                w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
                w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
            else:
                w(' offset += offsetIncrs%d[loc];' % i)
                w(' state = toStates%d[loc]-1;' % i)
            w(' }')
        elif i != 0:
            w('}')

        machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))

    # ends switch statement for machine
    w('}')

    w('')

    w(' if (state == -1) {')
    w(' // null state')
    w(' return -1;')
    w(' } else {')
    w(' // translate back to abs')
    w(' return state*(w+1)+offset;')
    w(' }')

    # ends transition method
    w('}')

    # (placeholder, replacement) pairs substituted into the final text once
    # the packed bit widths are known.
    subs = []
    if MODE == 'array':
        w.indent()
        for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
            w('')
            w.outdent()
            w('// %d vectors; %d states per vector; array length = %d' %
              (numVectors, numCasesPerVector, numVectors*numCasesPerVector))
            w.indent()
            if PACKED:
                # pack in python
                packedWords, nbits = pack(toStateArray)
                subs.append(('NBITSSTATES%d' % i, str(nbits)))
                w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' %
                  (i, nbits, renderList([javaLong(x) for x in packedWords])))

                packedWords, nbits = pack(toOffsetIncrsArray)
                subs.append(('NBITSOFFSET%d' % i, str(nbits)))
                w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' %
                  (i, nbits, renderList([javaLong(x) for x in packedWords])))
            else:
                w(' private final static int[] toStates%d = new int[] %s;' %
                  (i, renderList([str(x) for x in toStateArray])))
                # offsetIncrs must be rendered from the offset increments,
                # not from the target states.
                w(' private final static int[] offsetIncrs%d = new int[] %s;' %
                  (i, renderList([str(x) for x in toOffsetIncrsArray])))
        w.outdent()

    stateMap2 = dict([[v, k] for k, v in stateMap.items()])
    w('')
    w('// state map')
    minErrors = []
    for i in range(len(stateMap2)-1):
        w('// %s -> %s' % (i, stateMap2[i]))
        # we replace t-notation as its not relevant here
        st = stateMap2[i].replace('t', '')

        # NOTE(review): eval of a state description produced above.
        positions = eval(st)
        minErrors.append(min([-pos+err for pos, err in positions]))
    w('')

    w('')
    w(' public %s(int w) {' % className)
    w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
    w(' }')

    # Disabled: these methods are not emitted into the generated class.
    if 0:
        w('')
        w('@Override')
        w('public int size() { // this can now move up?')
        w(' return %d*(w+1);' % (len(stateMap2)-1))
        w('}')

        w('')
        w('@Override')
        w('public int getPosition(int absState) { // this can now move up?')
        w(' return absState % (w+1);')
        w('}')

        w('')
        w('@Override')
        w('public boolean isAccept(int absState) { // this can now move up?')
        w(' // decode absState -> state, offset')
        w(' int state = absState/(w+1);')
        w(' if (true || state < minErrors.length) {')
        w(' int offset = absState%(w+1);')
        w(' assert offset >= 0;')
        w(' return w - offset + minErrors[state] <= %d;' % n)
        w(' } else {')
        w(' return false;')
        w(' }')
        w('}')

    if MODE == 'array' and PACKED:

        # we moved into super class
        if False:
            w('')

            v = 2
            l = []
            for i in range(63):
                l.append(hex(v-1))
                v *= 2

            w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1)
            w('')

            # unpack in java
            w('private int unpack(long[] data, int index, int bitsPerValue) {')
            w(' final long bitLoc = bitsPerValue * index;')
            w(' final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD)
            w(' final int bitStart = (int) (bitLoc & %d);' % (WORD-1))
            w(' //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);')
            w(' if (bitStart + bitsPerValue <= %d) {' % WORD)
            w(' // not split')
            w(' return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);')
            w(' } else {')
            w(' // split')
            w(' final int part = %d-bitStart;' % WORD)
            w(' return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +')
            w(' ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1)
            w(' }')
            w('}')

    # class
    w('}')
    w('')

    fileOut = '%s.java' % className

    s = str(w)
    for sub, repl in subs:
        s = s.replace(sub, repl)

    # Write bytes so line endings are not translated; close the file before
    # its size is read back below.
    data = s if isinstance(s, bytes) else s.encode('utf-8')
    with open(fileOut, 'wb') as out:
        out.write(data)

    print('Wrote %s [%d lines; %.1f KB]' %
          (fileOut, len(w.l), os.path.getsize(fileOut)/1024.))
||||
|
||||
def renderList(l):
    """Render a list of strings as a brace-enclosed array initializer.

    Items are comma separated with a line break after every fourth item.
    """
    pieces = [' ']
    for index, item in enumerate(l):
        if index > 0:
            pieces.append(',')
            # start a new row after each group of four items
            if index % 4 == 0:
                pieces.append('\n ')
        pieces.append(item)
    return '{\n%s\n }' % ''.join(pieces)
||||
|
||||
# MASKS[k] has the bottom k+1 bits set; 63 entries cover 1..63 bits.
MASKS = [(2 << bit) - 1 for bit in range(63)]
||||
|
||||
# packs into longs; returns long[], numBits |
||||
def pack(l):
    """Bit-pack non-negative integers into WORD-bit values.

    Every value is stored with the same width, the number of bits needed
    for the largest value (at least 1).  Values are laid out from the low
    bits upward and may straddle two consecutive words.

    Returns ``(packed, bitsPerValue)`` where ``packed`` is the list of
    words.  Raises ValueError (from ``max``) for an empty list.
    """
    maxV = max(l)
    # bit_length is exact, whereas a ratio of floating-point logarithms can
    # land just above or below an integer and change the result.
    bitsPerValue = max(1, maxV.bit_length())

    bitsLeft = WORD
    pendingValue = 0

    packed = []
    for v in l:
        if pendingValue > 0:
            # sanity check: the pending word only uses bits already handed out
            bitsUsed = pendingValue.bit_length()
            assert bitsUsed <= (WORD-bitsLeft), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (bitsLeft, WORD, bitsLeft, WORD-bitsLeft, bitsUsed)

        if bitsLeft >= bitsPerValue:
            # value fits entirely into the current word
            pendingValue += v << (WORD-bitsLeft)
            bitsLeft -= bitsPerValue
            if bitsLeft == 0:
                packed.append(pendingValue)
                bitsLeft = WORD
                pendingValue = 0
        else:
            # split

            # bottom bitsLeft go in current word:
            pendingValue += (v & MASKS[bitsLeft-1]) << (WORD-bitsLeft)
            packed.append(pendingValue)

            # remaining high bits start the next word
            pendingValue = v >> bitsLeft
            bitsLeft = WORD - (bitsPerValue-bitsLeft)

    if bitsLeft < WORD:
        # flush the partially filled last word
        packed.append(pendingValue)

    # verify(l, packed, bitsPerValue)

    return packed, bitsPerValue
||||
|
||||
def verify(data, packedData, bitsPerValue):
    """Assert that every value of ``data`` can be read back from ``packedData``."""
    for index, expected in enumerate(data):
        assert expected == unpack(packedData, index, bitsPerValue)
||||
|
||||
def unpack(data, index, bitsPerValue):
    """Read the ``index``-th ``bitsPerValue``-wide value from packed words.

    Python counterpart of the generated Java ``unpack``; handles values
    that straddle two consecutive words.
    """
    bitLoc = bitsPerValue * index
    wordIndex = int(bitLoc >> LOG2_WORD)
    shift = int(bitLoc & (WORD-1))
    low = data[wordIndex] >> shift

    if shift + bitsPerValue <= WORD:
        # not split
        return int(low & MASKS[bitsPerValue-1])

    # split: low bits come from this word, the rest from the next one
    part = WORD - shift
    high = data[1+wordIndex] & MASKS[bitsPerValue-part-1]
    return int((low & MASKS[part-1]) + (high << part))
||||
|
||||
if __name__ == '__main__':
    # The generator relies on its assert statements, which -O strips.
    if __debug__:
        main()
    else:
        print('')
        print('ERROR: please run without -O')
        print('')
        sys.exit(1)
@ -1,335 +0,0 @@
|
||||
#! /usr/bin/env python |
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
from fractions import gcd |
||||
|
||||
"""Code generation for bulk operations""" |
||||
|
||||
# Largest bits-per-value setting handled by this generator's specialized code.
MAX_SPECIALIZED_BITS_PER_VALUE = 24
# Bits-per-value settings listed for the Packed64SingleBlock format.
PACKED_64_SINGLE_BLOCK_BPV = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    12, 16, 21, 32,
]
# Name of the generated Java source file.
OUTPUT_FILE = "BulkOperation.java"
||||
HEADER = """// This file has been automatically generated, DO NOT EDIT |
||||
|
||||
package com.fr.third.org.apache.lucene.util.packed; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
""" |
||||
|
||||
FOOTER=""" |
||||
protected int writeLong(long block, byte[] blocks, int blocksOffset) { |
||||
for (int j = 1; j <= 8; ++j) { |
||||
blocks[blocksOffset++] = (byte) (block >>> (64 - (j << 3))); |
||||
} |
||||
return blocksOffset; |
||||
} |
||||
|
||||
/** |
||||
* For every number of bits per value, there is a minimum number of |
||||
* blocks (b) / values (v) you need to write in order to reach the next block |
||||
* boundary: |
||||
* - 16 bits per value -> b=1, v=4 |
||||
* - 24 bits per value -> b=3, v=8 |
||||
* - 50 bits per value -> b=25, v=32 |
||||
* - 63 bits per value -> b=63, v=64 |
||||
* - ... |
||||
* |
||||
* A bulk read consists in copying <code>iterations*v</code> values that are |
||||
* contained in <code>iterations*b</code> blocks into a <code>long[]</code> |
||||
* (higher values of <code>iterations</code> are likely to yield a better |
||||
* throughput) => this requires n * (b + v) longs in memory. |
||||
* |
||||
* This method computes <code>iterations</code> as |
||||
* <code>ramBudget / (8 * (b + v))</code> (since a long is 8 bytes). |
||||
*/ |
||||
public final int computeIterations(int valueCount, int ramBudget) { |
||||
final int iterations = (ramBudget >>> 3) / (blockCount() + valueCount()); |
||||
if (iterations == 0) { |
||||
// at least 1 |
||||
return 1; |
||||
} else if ((iterations - 1) * blockCount() >= valueCount) { |
||||
// don't allocate for more than the size of the reader |
||||
return (int) Math.ceil((double) valueCount / valueCount()); |
||||
} else { |
||||
return iterations; |
||||
} |
||||
} |
||||
} |
||||
""" |
||||
|
||||
def is_power_of_two(n):
    """Return True when ``n`` is a positive power of two (1, 2, 4, ...).

    Zero and negative numbers are not powers of two.
    """
    # n & (n - 1) clears the lowest set bit; the result is zero only when a
    # single bit was set.  The n > 0 guard excludes 0, which would otherwise
    # pass the bit test.
    return n > 0 and n & (n - 1) == 0
||||
|
||||
def casts(typ):
    """Return the (prefix, suffix) strings that cast an expression to ``typ``.

    No cast is needed for "long", so both strings are empty in that case.
    """
    if typ == "long":
        return "", ""
    return "(%s) (" % typ, ")"
||||
|
||||
def hexNoLSuffix(n):
    """Return ``hex(n)`` without the trailing 'L' some interpreters append."""
    # On 32 bit Python values > (1 << 31)-1 will have L appended by hex function:
    text = hex(n)
    return text[:-1] if text.endswith('L') else text
||||
|
||||
def masks(bits):
    """Return the (prefix, suffix) strings that mask an expression to *bits* bits.

    The suffix is " & 0x...L)" built from the all-ones value of that width;
    for 64 bits no mask is needed and both strings are empty.
    """
    if bits != 64:
        all_ones = (1 << bits) - 1
        return "(", " & %sL)" % hexNoLSuffix(all_ones)
    return "", ""
||||
|
||||
def get_type(bits):
    """Return the Java primitive type name for a width of 8, 16, 32 or 64 bits.

    Raises:
        AssertionError: if *bits* is not one of the supported widths. The
            error is raised explicitly (rather than via ``assert False``) so
            it still fires when Python runs with ``-O``.
    """
    java_types = {8: "byte", 16: "short", 32: "int", 64: "long"}
    if bits not in java_types:
        raise AssertionError("unsupported number of bits: %r" % (bits,))
    return java_types[bits]
||||
|
||||
def block_value_count(bpv, bits=64):
    """Return the smallest (blocks, values) pair that ends on a block boundary.

    Starting from ``bpv`` blocks of ``bits`` bits each, both counts are halved
    while they are both even, so that ``values * bpv == bits * blocks`` still
    holds for the reduced pair.

    Floor division (``//``) keeps the results integral on Python 2 and 3
    alike; plain ``/`` would produce floats on Python 3.
    """
    blocks = bpv
    values = blocks * bits // bpv
    while blocks % 2 == 0 and values % 2 == 0:
        blocks //= 2
        values //= 2
    # Internal sanity check: the reduced pair must cover exactly the same bits.
    assert values * bpv == bits * blocks, "%d values, %d blocks, %d bits per value" % (values, blocks, bpv)
    return (blocks, values)
||||
|
||||
def packed64(bpv, f):
    """Write the body of the BulkOperationPacked<bpv> Java class to *f*.

    Emits the constructor (with assertions on the block/value counts) and
    then the decode methods: a fixed copy-based implementation for 64 bits
    per value, or the generated int and long decoders otherwise.
    """
    block_count, value_count = block_value_count(bpv)

    constructor_lines = (
        "\n",
        " public BulkOperationPacked%d() {\n" % bpv,
        " super(%d);\n" % bpv,
        " assert blockCount() == %d;\n" % block_count,
        " assert valueCount() == %d;\n" % value_count,
        " }\n\n",
    )
    for line in constructor_lines:
        f.write(line)

    if bpv != 64:
        # One pair of decode methods per target array type (int[], long[]).
        for target_bits in (32, 64):
            p64_decode(bpv, f, target_bits)
        return

    # 64 bits per value: blocks and values coincide, so decoding is a copy.
    f.write(""" @Override
public void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
System.arraycopy(blocks, blocksOffset, values, valuesOffset, valueCount() * iterations);
}

@Override
public void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
throw new UnsupportedOperationException();
}

@Override
public void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
throw new UnsupportedOperationException();
}

@Override
public void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
LongBuffer.wrap(values, valuesOffset, iterations * valueCount()).put(ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer());
}
""")
||||
|
||||
def p64_decode(bpv, f, bits):
    """Write the two Java ``decode`` methods for *bpv* bits per value to *f*.

    The first method decodes from ``long[]`` blocks, the second from
    ``byte[]`` blocks; both store into an array whose element type is chosen
    by *bits* (see ``get_type``). When the target type is narrower than
    *bpv*, the generated methods just throw UnsupportedOperationException.

    Args:
        bpv: number of bits per packed value.
        f: writable file-like object receiving the Java source.
        bits: width of the target array element type.
    """
    _, values = block_value_count(bpv)
    typ = get_type(bits)
    cast_start, cast_end = casts(typ)
    mask = (1 << bpv) - 1

    # ---- decode(long[] blocks, ...) ----
    f.write(" @Override\n")
    f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" % typ)
    if bits < bpv:
        f.write(" throw new UnsupportedOperationException();\n")
    else:
        f.write(" for (int i = 0; i < iterations; ++i) {\n")

        if is_power_of_two(bpv):
            # Values never straddle a block: a Java-side loop over shifts is enough.
            # NOTE(review): the mask literal has no L suffix, so a mask above
            # Integer.MAX_VALUE (bpv == 32) would not compile in Java -- confirm
            # bpv == 32 never reaches this path.
            f.write(" final long block = blocks[blocksOffset++];\n")
            f.write(" for (int shift = %d; shift >= 0; shift -= %d) {\n" % (64 - bpv, bpv))
            f.write(" values[valuesOffset++] = %s(block >>> shift) & %d%s;\n" % (cast_start, mask, cast_end))
            f.write(" }\n")
        else:
            # Fully unrolled: one statement per value of the iteration.
            for i in range(values):
                block_offset = i * bpv // 64
                bit_offset = (i * bpv) % 64
                if bit_offset == 0:
                    # start of block
                    f.write(" final long block%d = blocks[blocksOffset++];\n" % block_offset)
                    f.write(" values[valuesOffset++] = %sblock%d >>> %d%s;\n" % (cast_start, block_offset, 64 - bpv, cast_end))
                elif bit_offset + bpv == 64:
                    # end of block
                    f.write(" values[valuesOffset++] = %sblock%d & %dL%s;\n" % (cast_start, block_offset, mask, cast_end))
                elif bit_offset + bpv < 64:
                    # middle of block
                    f.write(" values[valuesOffset++] = %s(block%d >>> %d) & %dL%s;\n" % (cast_start, block_offset, 64 - bit_offset - bpv, mask, cast_end))
                else:
                    # value spans across 2 blocks
                    mask1 = (1 << (64 - bit_offset)) - 1
                    shift1 = bit_offset + bpv - 64
                    shift2 = 64 - shift1
                    f.write(" final long block%d = blocks[blocksOffset++];\n" % (block_offset + 1))
                    f.write(" values[valuesOffset++] = %s((block%d & %dL) << %d) | (block%d >>> %d)%s;\n" % (cast_start, block_offset, mask1, shift1, block_offset + 1, shift2, cast_end))
        f.write(" }\n")
    f.write(" }\n\n")

    # ---- decode(byte[] blocks, ...) ----
    _, byte_values = block_value_count(bpv, 8)

    f.write(" @Override\n")
    f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" % typ)
    if bits < bpv:
        f.write(" throw new UnsupportedOperationException();\n")
    else:
        if is_power_of_two(bpv) and bpv < 8:
            # Several whole values per byte.
            f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n")
            f.write(" final byte block = blocks[blocksOffset++];\n")
            for shift in range(8 - bpv, 0, -bpv):
                f.write(" values[valuesOffset++] = (block >>> %d) & %d;\n" % (shift, mask))
            f.write(" values[valuesOffset++] = block & %d;\n" % mask)
            f.write(" }\n")
        elif bpv == 8:
            f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n")
            f.write(" values[valuesOffset++] = blocks[blocksOffset++] & 0xFF;\n")
            f.write(" }\n")
        elif is_power_of_two(bpv) and bpv > 8:
            # Each value is a whole number of bytes, most significant first.
            f.write(" for (int j = 0; j < %d * iterations; ++j) {\n" % (64 // bpv))
            m = "0xFF" if bits <= 32 else "0xFFL"
            f.write(" values[valuesOffset++] =")
            for i in range(bpv // 8 - 1):
                # Each successive byte sits 8 bits lower than the previous one.
                # (A constant shift of bpv - 8 is only right for the first byte,
                # i.e. it only works when a value is exactly two bytes wide.)
                f.write(" ((blocks[blocksOffset++] & %s) << %d) |" % (m, bpv - 8 * (i + 1)))
            f.write(" (blocks[blocksOffset++] & %s);\n" % m)
            f.write(" }\n")
        else:
            # Values straddle byte boundaries: unroll one statement per value.
            f.write(" for (int i = 0; i < 8 * iterations; ++i) {\n")
            for i in range(byte_values):
                byte_start = i * bpv // 8
                bit_start = (i * bpv) % 8
                byte_end = ((i + 1) * bpv - 1) // 8
                bit_end = ((i + 1) * bpv - 1) % 8

                def left_shift(b):
                    # Left shift that aligns byte b within the assembled value.
                    return 8 * (byte_end - b - 1) + 1 + bit_end

                # Read every byte this value touches that was not read yet.
                if bit_start == 0:
                    f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" % (typ, byte_start))
                for b in range(byte_start + 1, byte_end + 1):
                    f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" % (typ, b))
                f.write(" values[valuesOffset++] =")
                if byte_start == byte_end:
                    # Value lives entirely inside one byte.
                    if bit_start == 0:
                        if bit_end == 7:
                            f.write(" byte%d" % byte_start)
                        else:
                            f.write(" byte%d >>> %d" % (byte_start, 7 - bit_end))
                    else:
                        if bit_end == 7:
                            f.write(" byte%d & %d" % (byte_start, 2 ** (8 - bit_start) - 1))
                        else:
                            f.write(" (byte%d >>> %d) & %d" % (byte_start, 7 - bit_end, 2 ** (bit_end - bit_start + 1) - 1))
                else:
                    # Value spans several bytes: head | middle bytes | tail.
                    if bit_start == 0:
                        f.write(" (byte%d << %d)" % (byte_start, left_shift(byte_start)))
                    else:
                        f.write(" ((byte%d & %d) << %d)" % (byte_start, 2 ** (8 - bit_start) - 1, left_shift(byte_start)))
                    for b in range(byte_start + 1, byte_end):
                        f.write(" | (byte%d << %d)" % (b, left_shift(b)))
                    if bit_end == 7:
                        f.write(" | byte%d" % byte_end)
                    else:
                        f.write(" | (byte%d >>> %d)" % (byte_end, 7 - bit_end))
                f.write(";\n")
            f.write(" }\n")
    f.write(" }\n\n")
||||
|
||||
if __name__ == '__main__':
    # Generates OUTPUT_FILE (the BulkOperation dispatcher) plus one
    # BulkOperationPacked<N>.java per specialized bits-per-value.
    # `with` guarantees every file is flushed and closed even if a write fails.
    with open(OUTPUT_FILE, 'w') as f:
        f.write(HEADER)
        f.write('\n')
        f.write('''/**
* Efficient sequential read/write of packed integers.
*/\n''')

        f.write('abstract class BulkOperation implements PackedInts.Decoder, PackedInts.Encoder {\n')
        f.write(' private static final BulkOperation[] packedBulkOps = new BulkOperation[] {\n')

        for bpv in range(1, 65):
            if bpv > MAX_SPECIALIZED_BITS_PER_VALUE:
                # No specialized class: fall back to the generic implementation.
                f.write(' new BulkOperationPacked(%d),\n' % bpv)
                continue
            with open('BulkOperationPacked%d.java' % bpv, 'w') as f2:
                f2.write(HEADER)
                if bpv == 64:
                    # Only the 64-bit decoder uses the NIO buffer classes.
                    f2.write('import java.nio.LongBuffer;\n')
                    f2.write('import java.nio.ByteBuffer;\n')
                    f2.write('\n')
                f2.write('''/**
* Efficient sequential read/write of packed integers.
*/\n''')
                f2.write('final class BulkOperationPacked%d extends BulkOperationPacked {\n' % bpv)
                packed64(bpv, f2)
                f2.write('}\n')
            f.write(' new BulkOperationPacked%d(),\n' % bpv)

        f.write(' };\n')
        f.write('\n')

        f.write(' // NOTE: this is sparse (some entries are null):\n')
        f.write(' private static final BulkOperation[] packedSingleBlockBulkOps = new BulkOperation[] {\n')
        for bpv in range(1, max(PACKED_64_SINGLE_BLOCK_BPV) + 1):
            if bpv in PACKED_64_SINGLE_BLOCK_BPV:
                f.write(' new BulkOperationPackedSingleBlock(%d),\n' % bpv)
            else:
                f.write(' null,\n')
        f.write(' };\n')
        f.write('\n')

        f.write("\n")
        f.write(" public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {\n")
        f.write(" switch (format) {\n")

        f.write(" case PACKED:\n")
        f.write(" assert packedBulkOps[bitsPerValue - 1] != null;\n")
        f.write(" return packedBulkOps[bitsPerValue - 1];\n")
        f.write(" case PACKED_SINGLE_BLOCK:\n")
        f.write(" assert packedSingleBlockBulkOps[bitsPerValue - 1] != null;\n")
        f.write(" return packedSingleBlockBulkOps[bitsPerValue - 1];\n")
        f.write(" default:\n")
        f.write(" throw new AssertionError();\n")
        f.write(" }\n")
        f.write(" }\n")
        f.write(FOOTER)
@ -1,175 +0,0 @@
|
||||
#! /usr/bin/env python |
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
HEADER="""// This file has been automatically generated, DO NOT EDIT |
||||
|
||||
package com.fr.third.org.apache.lucene.util.packed; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.store.DataInput; |
||||
import com.fr.third.org.apache.lucene.util.RamUsageEstimator; |
||||
|
||||
import java.io.IOException; |
||||
import java.util.Arrays; |
||||
|
||||
""" |
||||
|
||||
# Java element type, read-back mask and store cast for each supported width.
TYPES = {8: "byte", 16: "short", 32: "int", 64: "long"}
MASKS = {8: " & 0xFFL", 16: " & 0xFFFFL", 32: " & 0xFFFFFFFFL", 64: ""}
CASTS = {8: "(byte) ", 16: "(short) ", 32: "(int) ", 64: ""}

if __name__ == '__main__':
    # Writes one Direct<N>.java source file per entry of TYPES.
    for bpv in TYPES:
        java_type = TYPES[bpv]
        read_suffix = java_type.title()  # e.g. "Byte" for in.readByte()
        # `with` closes each generated file, even if a write raises.
        with open("Direct%d.java" % bpv, 'w') as f:
            f.write(HEADER)
            f.write("""/**
* Direct wrapping of %d-bits values to a backing array.
* @lucene.internal
*/\n""" % bpv)
            f.write("final class Direct%d extends PackedInts.MutableImpl {\n" % bpv)
            f.write(" final %s[] values;\n\n" % java_type)

            f.write(" Direct%d(int valueCount) {\n" % bpv)
            f.write(" super(valueCount, %d);\n" % bpv)
            f.write(" values = new %s[valueCount];\n" % java_type)
            f.write(" }\n\n")

            f.write(" Direct%d(DataInput in, int valueCount) throws IOException {\n" % bpv)
            f.write(" this(valueCount);\n")
            f.write(" for (int i = 0; i < valueCount; ++i) {\n")
            f.write(" values[i] = in.read%s();\n" % read_suffix)
            f.write(" }\n")
            if bpv != 64:
                # The generated constructor reads and discards extra elements
                # until a multiple of 64 bits has been consumed.
                per_long = 64 // bpv
                f.write(" final int mod = valueCount %% %d;\n" % per_long)
                f.write(" if (mod != 0) {\n")
                f.write(" for (int i = mod; i < %d; ++i) {\n" % per_long)
                f.write(" in.read%s();\n" % read_suffix)
                f.write(" }\n")
                f.write(" }\n")
            f.write(" }\n")

            f.write("""
@Override
public long get(final int index) {
return values[index]%s;
}

public void set(final int index, final long value) {
values[index] = %s(value);
}

public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(values);
}

public void clear() {
Arrays.fill(values, %s0L);
}

@Override
public Object getArray() {
return values;
}

@Override
public boolean hasArray() {
return true;
}
""" % (MASKS[bpv], CASTS[bpv], CASTS[bpv]))

            if bpv == 64:
                # long[] backing array: bulk operations are plain array copies.
                f.write("""
@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int gets = Math.min(valueCount - index, len);
System.arraycopy(values, index, arr, off, gets);
return gets;
}

public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int sets = Math.min(valueCount - index, len);
System.arraycopy(arr, off, values, index, sets);
return sets;
}

@Override
public void fill(int fromIndex, int toIndex, long val) {
Arrays.fill(values, fromIndex, toIndex, val);
}
""")
            else:
                # Narrower backing arrays: mask on read, cast on write.
                f.write("""
@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int gets = Math.min(valueCount - index, len);
for (int i = index, o = off, end = index + gets; i < end; ++i, ++o) {
arr[o] = values[i]%s;
}
return gets;
}

public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int sets = Math.min(valueCount - index, len);
for (int i = index, o = off, end = index + sets; i < end; ++i, ++o) {
values[i] = %sarr[o];
}
return sets;
}

@Override
public void fill(int fromIndex, int toIndex, long val) {
assert val == (val%s);
Arrays.fill(values, fromIndex, toIndex, %sval);
}
""" % (MASKS[bpv], CASTS[bpv], MASKS[bpv], CASTS[bpv]))

            f.write("}\n")
@ -1,291 +0,0 @@
|
||||
#! /usr/bin/env python |
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
SUPPORTED_BITS_PER_VALUE = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32] |
||||
|
||||
HEADER="""// This file has been automatically generated, DO NOT EDIT |
||||
|
||||
package com.fr.third.org.apache.lucene.util.packed; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with this |
||||
* work for additional information regarding copyright ownership. The ASF |
||||
* licenses this file to You under the Apache License, Version 2.0 (the |
||||
* "License"); you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
||||
* License for the specific language governing permissions and limitations under |
||||
* the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.util.Arrays; |
||||
|
||||
import com.fr.third.org.apache.lucene.store.DataInput; |
||||
import com.fr.third.org.apache.lucene.util.RamUsageEstimator; |
||||
|
||||
/** |
||||
* This class is similar to {@link Packed64} except that it trades space for |
||||
* speed by ensuring that a single block needs to be read/written in order to |
||||
* read/write a value. |
||||
*/ |
||||
abstract class Packed64SingleBlock extends PackedInts.MutableImpl { |
||||
|
||||
public static final int MAX_SUPPORTED_BITS_PER_VALUE = %d; |
||||
private static final int[] SUPPORTED_BITS_PER_VALUE = new int[] {%s}; |
||||
|
||||
public static boolean isSupported(int bitsPerValue) { |
||||
return Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) >= 0; |
||||
} |
||||
|
||||
private static int requiredCapacity(int valueCount, int valuesPerBlock) { |
||||
return valueCount / valuesPerBlock |
||||
+ (valueCount %% valuesPerBlock == 0 ? 0 : 1); |
||||
} |
||||
|
||||
final long[] blocks; |
||||
|
||||
Packed64SingleBlock(int valueCount, int bitsPerValue) { |
||||
super(valueCount, bitsPerValue); |
||||
assert isSupported(bitsPerValue); |
||||
final int valuesPerBlock = 64 / bitsPerValue; |
||||
blocks = new long[requiredCapacity(valueCount, valuesPerBlock)]; |
||||
} |
||||
|
||||
@Override |
||||
public void clear() { |
||||
Arrays.fill(blocks, 0L); |
||||
} |
||||
|
||||
public long ramBytesUsed() { |
||||
return RamUsageEstimator.sizeOf(blocks); |
||||
} |
||||
|
||||
@Override |
||||
public int get(int index, long[] arr, int off, int len) { |
||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
||||
assert index >= 0 && index < valueCount; |
||||
len = Math.min(len, valueCount - index); |
||||
assert off + len <= arr.length; |
||||
|
||||
final int originalIndex = index; |
||||
|
||||
// go to the next block boundary |
||||
final int valuesPerBlock = 64 / bitsPerValue; |
||||
final int offsetInBlock = index %% valuesPerBlock; |
||||
if (offsetInBlock != 0) { |
||||
for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) { |
||||
arr[off++] = get(index++); |
||||
--len; |
||||
} |
||||
if (len == 0) { |
||||
return index - originalIndex; |
||||
} |
||||
} |
||||
|
||||
// bulk get |
||||
assert index %% valuesPerBlock == 0; |
||||
final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); |
||||
assert decoder.blockCount() == 1; |
||||
assert decoder.valueCount() == valuesPerBlock; |
||||
final int blockIndex = index / valuesPerBlock; |
||||
final int nblocks = (index + len) / valuesPerBlock - blockIndex; |
||||
decoder.decode(blocks, blockIndex, arr, off, nblocks); |
||||
final int diff = nblocks * valuesPerBlock; |
||||
index += diff; len -= diff; |
||||
|
||||
if (index > originalIndex) { |
||||
// stay at the block boundary |
||||
return index - originalIndex; |
||||
} else { |
||||
// no progress so far => already at a block boundary but no full block to |
||||
// get |
||||
assert index == originalIndex; |
||||
return super.get(index, arr, off, len); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public int set(int index, long[] arr, int off, int len) { |
||||
assert len > 0 : "len must be > 0 (got " + len + ")"; |
||||
assert index >= 0 && index < valueCount; |
||||
len = Math.min(len, valueCount - index); |
||||
assert off + len <= arr.length; |
||||
|
||||
final int originalIndex = index; |
||||
|
||||
// go to the next block boundary |
||||
final int valuesPerBlock = 64 / bitsPerValue; |
||||
final int offsetInBlock = index %% valuesPerBlock; |
||||
if (offsetInBlock != 0) { |
||||
for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) { |
||||
set(index++, arr[off++]); |
||||
--len; |
||||
} |
||||
if (len == 0) { |
||||
return index - originalIndex; |
||||
} |
||||
} |
||||
|
||||
// bulk set |
||||
assert index %% valuesPerBlock == 0; |
||||
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); |
||||
assert op.blockCount() == 1; |
||||
assert op.valueCount() == valuesPerBlock; |
||||
final int blockIndex = index / valuesPerBlock; |
||||
final int nblocks = (index + len) / valuesPerBlock - blockIndex; |
||||
op.encode(arr, off, blocks, blockIndex, nblocks); |
||||
final int diff = nblocks * valuesPerBlock; |
||||
index += diff; len -= diff; |
||||
|
||||
if (index > originalIndex) { |
||||
// stay at the block boundary |
||||
return index - originalIndex; |
||||
} else { |
||||
// no progress so far => already at a block boundary but no full block to |
||||
// set |
||||
assert index == originalIndex; |
||||
return super.set(index, arr, off, len); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void fill(int fromIndex, int toIndex, long val) { |
||||
assert fromIndex >= 0; |
||||
assert fromIndex <= toIndex; |
||||
assert PackedInts.bitsRequired(val) <= bitsPerValue; |
||||
|
||||
final int valuesPerBlock = 64 / bitsPerValue; |
||||
if (toIndex - fromIndex <= valuesPerBlock << 1) { |
||||
// there needs to be at least one full block to set for the block |
||||
// approach to be worth trying |
||||
super.fill(fromIndex, toIndex, val); |
||||
return; |
||||
} |
||||
|
||||
// set values naively until the next block start |
||||
int fromOffsetInBlock = fromIndex %% valuesPerBlock; |
||||
if (fromOffsetInBlock != 0) { |
||||
for (int i = fromOffsetInBlock; i < valuesPerBlock; ++i) { |
||||
set(fromIndex++, val); |
||||
} |
||||
assert fromIndex %% valuesPerBlock == 0; |
||||
} |
||||
|
||||
// bulk set of the inner blocks |
||||
final int fromBlock = fromIndex / valuesPerBlock; |
||||
final int toBlock = toIndex / valuesPerBlock; |
||||
assert fromBlock * valuesPerBlock == fromIndex; |
||||
|
||||
long blockValue = 0L; |
||||
for (int i = 0; i < valuesPerBlock; ++i) { |
||||
blockValue = blockValue | (val << (i * bitsPerValue)); |
||||
} |
||||
Arrays.fill(blocks, fromBlock, toBlock, blockValue); |
||||
|
||||
// fill the gap |
||||
for (int i = valuesPerBlock * toBlock; i < toIndex; ++i) { |
||||
set(i, val); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
protected PackedInts.Format getFormat() { |
||||
return PackedInts.Format.PACKED_SINGLE_BLOCK; |
||||
} |
||||
|
||||
@Override |
||||
public String toString() { |
||||
return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue |
||||
+ ", size=" + size() + ", elements.length=" + blocks.length + ")"; |
||||
} |
||||
|
||||
public static Packed64SingleBlock create(DataInput in, |
||||
int valueCount, int bitsPerValue) throws IOException { |
||||
Packed64SingleBlock reader = create(valueCount, bitsPerValue); |
||||
for (int i = 0; i < reader.blocks.length; ++i) { |
||||
reader.blocks[i] = in.readLong(); |
||||
} |
||||
return reader; |
||||
} |
||||
|
||||
""" %(SUPPORTED_BITS_PER_VALUE[-1], ", ".join(map(str, SUPPORTED_BITS_PER_VALUE))) |
||||
|
||||
FOOTER = "}" |
||||
|
||||
if __name__ == '__main__':
    # Writes Packed64SingleBlock.java: the shared HEADER, a create() factory
    # and one nested subclass per supported bits-per-value.
    # `with` guarantees the file is closed even if generation fails midway.
    with open("Packed64SingleBlock.java", 'w') as f:
        f.write(HEADER)
        f.write(" public static Packed64SingleBlock create(int valueCount, int bitsPerValue) {\n")
        f.write(" switch (bitsPerValue) {\n")
        for bpv in SUPPORTED_BITS_PER_VALUE:
            f.write(" case %d:\n" % bpv)
            f.write(" return new Packed64SingleBlock%d(valueCount);\n" % bpv)
        f.write(" default:\n")
        # Report the value actually passed to create(). Interpolating the
        # Python loop variable here would hard-code the last supported width
        # into the generated exception message.
        f.write(" throw new IllegalArgumentException(\"Unsupported number of bits per value: \" + bitsPerValue);\n")
        f.write(" }\n")
        f.write(" }\n\n")

        for bpv in SUPPORTED_BITS_PER_VALUE:
            # log_2 is the exact base-2 logarithm of bpv, or None when bpv is
            # not a power of two.
            log_2 = 0
            while (1 << log_2) < bpv:
                log_2 = log_2 + 1
            if (1 << log_2) != bpv:
                log_2 = None

            # Java statements locating a value: block index `o`, slot `b`
            # inside the block, and the bit `shift` of that slot.
            if log_2 is not None:
                # Power of two: shifts and masks replace division and modulo.
                locate = ((" final int o = index >>> %d;\n" % (6 - log_2))
                          + (" final int b = index & %d;\n" % ((1 << (6 - log_2)) - 1))
                          + (" final int shift = b << %d;\n" % log_2))
            else:
                values_per_block = 64 // bpv
                locate = ((" final int o = index / %d;\n" % values_per_block)
                          + (" final int b = index %% %d;\n" % values_per_block)
                          + (" final int shift = b * %d;\n" % bpv))
            value_mask = (1 << bpv) - 1

            f.write(" static class Packed64SingleBlock%d extends Packed64SingleBlock {\n\n" % bpv)

            f.write(" Packed64SingleBlock%d(int valueCount) {\n" % bpv)
            f.write(" super(valueCount, %d);\n" % bpv)
            f.write(" }\n\n")

            f.write(" @Override\n")
            f.write(" public long get(int index) {\n")
            f.write(locate)
            f.write(" return (blocks[o] >>> shift) & %dL;\n" % value_mask)
            f.write(" }\n\n")

            f.write(" @Override\n")
            f.write(" public void set(int index, long value) {\n")
            f.write(locate)
            f.write(" blocks[o] = (blocks[o] & ~(%dL << shift)) | (value << shift);\n" % value_mask)
            f.write(" }\n\n")
            f.write(" }\n\n")

        f.write(FOOTER)
@ -1,161 +0,0 @@
|
||||
#! /usr/bin/env python |
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
HEADER="""// This file has been automatically generated, DO NOT EDIT |
||||
|
||||
package com.fr.third.org.apache.lucene.util.packed; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.store.DataInput; |
||||
import com.fr.third.org.apache.lucene.util.RamUsageEstimator; |
||||
|
||||
import java.io.IOException; |
||||
import java.util.Arrays; |
||||
|
||||
""" |
||||
|
||||
# Java element type, read-back mask and store cast per block width.
# Only the widths listed in TYPES are generated.
TYPES = {8: "byte", 16: "short"}
MASKS = {8: " & 0xFFL", 16: " & 0xFFFFL", 32: " & 0xFFFFFFFFL", 64: ""}
CASTS = {8: "(byte) ", 16: "(short) ", 32: "(int) ", 64: ""}

if __name__ == '__main__':
    # Writes one Packed<N>ThreeBlocks.java source file per entry of TYPES;
    # each value is stored in three consecutive array elements.
    for bpv in TYPES:
        typ = TYPES[bpv]
        mask = MASKS[bpv]
        cast = CASTS[bpv]
        read_suffix = typ.title()  # e.g. "Short" for in.readShort()
        per_long = 64 // bpv
        # `with` closes each generated file, even if a write raises.
        with open("Packed%dThreeBlocks.java" % bpv, 'w') as f:
            f.write(HEADER)
            f.write("""/**
* Packs integers into 3 %ss (%d bits per value).
* @lucene.internal
*/\n""" % (typ, bpv * 3))
            f.write("final class Packed%dThreeBlocks extends PackedInts.MutableImpl {\n" % bpv)
            f.write(" final %s[] blocks;\n\n" % typ)

            f.write(" public static final int MAX_SIZE = Integer.MAX_VALUE / 3;\n\n")

            f.write(" Packed%dThreeBlocks(int valueCount) {\n" % bpv)
            f.write(" super(valueCount, %d);\n" % (bpv * 3))
            f.write(" if (valueCount > MAX_SIZE) {\n")
            f.write(" throw new ArrayIndexOutOfBoundsException(\"MAX_SIZE exceeded\");\n")
            f.write(" }\n")
            f.write(" blocks = new %s[valueCount * 3];\n" % typ)
            f.write(" }\n\n")

            f.write(" Packed%dThreeBlocks(DataInput in, int valueCount) throws IOException {\n" % bpv)
            f.write(" this(valueCount);\n")
            f.write(" for (int i = 0; i < 3 * valueCount; ++i) {\n")
            f.write(" blocks[i] = in.read%s();\n" % read_suffix)
            f.write(" }\n")
            # The generated constructor reads and discards extra elements
            # until a multiple of 64 bits has been consumed.
            f.write(" final int mod = blocks.length %% %d;\n" % per_long)
            f.write(" if (mod != 0) {\n")
            f.write(" for (int i = mod; i < %d; ++i) {\n" % per_long)
            f.write(" in.read%s();\n" % read_suffix)
            f.write(" }\n")
            f.write(" }\n")
            f.write(" }\n")

            # Placeholder order below follows the template top to bottom:
            # get, bulk get, set, bulk set, fill, clear.
            f.write("""
@Override
public long get(int index) {
final int o = index * 3;
return (blocks[o]%s) << %d | (blocks[o+1]%s) << %d | (blocks[o+2]%s);
}

@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int gets = Math.min(valueCount - index, len);
for (int i = index * 3, end = (index + gets) * 3; i < end; i+=3) {
arr[off++] = (blocks[i]%s) << %d | (blocks[i+1]%s) << %d | (blocks[i+2]%s);
}
return gets;
}

@Override
public void set(int index, long value) {
final int o = index * 3;
blocks[o] = %s(value >>> %d);
blocks[o+1] = %s(value >>> %d);
blocks[o+2] = %svalue;
}

@Override
public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;

final int sets = Math.min(valueCount - index, len);
for (int i = off, o = index * 3, end = off + sets; i < end; ++i) {
final long value = arr[i];
blocks[o++] = %s(value >>> %d);
blocks[o++] = %s(value >>> %d);
blocks[o++] = %svalue;
}
return sets;
}

@Override
public void fill(int fromIndex, int toIndex, long val) {
final %s block1 = %s(val >>> %d);
final %s block2 = %s(val >>> %d);
final %s block3 = %sval;
for (int i = fromIndex * 3, end = toIndex * 3; i < end; i += 3) {
blocks[i] = block1;
blocks[i+1] = block2;
blocks[i+2] = block3;
}
}

@Override
public void clear() {
Arrays.fill(blocks, %s0);
}

public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(blocks);
}

@Override
public String toString() {
return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue
+ ", size=" + size() + ", elements.length=" + blocks.length + ")";
}
}
""" % (mask, 2 * bpv, mask, bpv, mask,
       mask, 2 * bpv, mask, bpv, mask,
       cast, 2 * bpv, cast, bpv, cast,
       cast, 2 * bpv, cast, bpv, cast,
       typ, cast, 2 * bpv, typ, cast, bpv, typ, cast,
       cast))
Loading…
Reference in new issue