
REPORT-110010 Remove the Python scripts from lucene

release/10.0
Yuan.Wang 1 year ago
commit 9156f4439f
  1. fine-lucene/src/main/java/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py (539 deletions)
  2. fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/UTF32ToUTF8.py (366 deletions)
  3. fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/createLevAutomata.py (500 deletions)
  4. fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_BulkOperation.py (335 deletions)
  5. fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Direct.py (175 deletions)
  6. fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Packed64SingleBlock.py (291 deletions)
  7. fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_PackedThreeBlocks.py (161 deletions)

fine-lucene/src/main/java/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py (539 deletions)

@@ -1,539 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
def main():
print get_apache_license()
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
match = regex.match(line)
if match:
key = match.group(1)
if key == 'quot': codes[key] = r'\"'
elif key == 'nbsp': codes[key] = ' ';
else : codes[key] = r'\u%04X' % int(match.group(2))
keys = sorted(codes)
first_entry = True
output_line = 'CharacterEntities = ( '
for key in keys:
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line, ')'
print '%{'
print ' private static final Map<String,String> upperCaseVariantsAccepted'
print ' = new HashMap<String,String>();'
print ' static {'
print ' upperCaseVariantsAccepted.put("quot", "QUOT");'
print ' upperCaseVariantsAccepted.put("copy", "COPY");'
print ' upperCaseVariantsAccepted.put("gt", "GT");'
print ' upperCaseVariantsAccepted.put("lt", "LT");'
print ' upperCaseVariantsAccepted.put("reg", "REG");'
print ' upperCaseVariantsAccepted.put("amp", "AMP");'
print ' }'
print ' private static final CharArrayMap<Character> entityValues'
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
print ' static {'
print ' String[] entities = {'
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line[:-1]
print ' };'
print ' for (int i = 0 ; i < entities.length ; i += 2) {'
print ' Character value = entities[i + 1].charAt(0);'
print ' entityValues.put(entities[i], value);'
print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);'
print ' if (upperCaseVariant != null) {'
print ' entityValues.put(upperCaseVariant, value);'
print ' }'
print ' }'
print " }"
print "%}"
def get_entity_text():
# The text below is taken verbatim from
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
text = r"""
F.1. XHTML Character Entities
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent
Typical invocation:
<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->
F.1.2. XHTML Special Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent
Typical invocation:
<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
Revisions:
2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->
<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->
<!-- end of xhtml-special.ent -->
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent
Typical invocation:
<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->
<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->
<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->
<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->
<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
<!-- end of xhtml-symbol.ent -->
"""
return text
def get_apache_license():
license = r"""/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
return license
main()
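
Editor's note: the deleted htmlentity.py above is a Python 2 generator. It parses the W3C entity declarations embedded in get_entity_text() and prints the CharacterEntities alternation plus the entityValues map that get pasted into HTMLStripCharFilter.jflex. The following is a minimal Python 3 sketch of that parsing step, using a hypothetical reduced SAMPLE listing in place of the full W3C text; it illustrates the output shape and is not the original script.

    # Minimal Python 3 sketch (not the original script): pull entity names and
    # code points out of <!ENTITY ...> declarations and print one alternation
    # fragment per entity. SAMPLE is a hypothetical stand-in for get_entity_text().
    import re

    SAMPLE = r'''
    <!ENTITY nbsp   "&#160;" ><!-- no-break space -->
    <!ENTITY amp    "&#38;#38;" ><!-- ampersand -->
    <!ENTITY copy   "&#169;" ><!-- copyright sign -->
    '''

    entity = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
    codes = {}
    for line in SAMPLE.splitlines():
        m = entity.match(line)
        if m:
            name, codepoint = m.group(1), int(m.group(2))
            # the real script special-cases quot and nbsp; this sketch only nbsp
            codes[name] = ' ' if name == 'nbsp' else r'\u%04X' % codepoint

    # names that also accept an upper-case spelling in HTMLStripCharFilter.jflex
    upper_ok = {'quot', 'copy', 'gt', 'lt', 'reg', 'amp'}
    parts = []
    for name in sorted(codes):
        parts.append('"%s"' % name)
        if name in upper_ok:
            parts.append('"%s"' % name.upper())
    print('CharacterEntities = ( %s )' % ' | '.join(parts))
    # -> CharacterEntities = ( "amp" | "AMP" | "copy" | "COPY" | "nbsp" )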

fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/UTF32ToUTF8.py (366 deletions)

@@ -1,366 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import types
import os
import sys
import random
MAX_UNICODE = 0x10FFFF
# TODO
# - could be more minimal
# - eg when bracket lands on a utf8 boundary, like 3 - 2047 -- they can share the two * edges
# - also 3 2048 or 3 65536 -- it should not have an * down the red path, but it does
# MASKS[0] is bottom 1-bit
# MASKS[1] is bottom 2-bits
# ...
utf8Ranges = [(0, 127),
(128, 2047),
(2048, 65535),
(65536, 1114111)]
typeToColor = {'startend': 'purple',
'start': 'blue',
'end': 'red'}
class FSA:
def __init__(self):
# maps fromNode -> (startUTF8, endUTF8, endNode)
self.states = {}
self.nodeUpto = 0
def run(self, bytes):
state = self.start
for b in bytes:
found = False
oldState = state
for label, s, e, n in self.states[state][1:]:
if b >= s and b <= e:
if found:
raise RuntimeError('state %s has ambiguous output for byte %s' % (oldState, b))
state = n
found = True
if not found:
return -1
return state
def addEdge(self, n1, n2, v1, v2, label):
"""
Adds edge from n1-n2, utf8 byte range v1-v2.
"""
assert n1 in self.states
assert type(v1) is types.IntType
assert type(v2) is types.IntType
self.states[n1].append((label, v1, v2, n2))
def addNode(self, label=None):
try:
self.states[self.nodeUpto] = [label]
return self.nodeUpto
finally:
self.nodeUpto += 1
def toDOT(self, label):
__l = []
w = __l.append
endNode = startNode = None
for id, details in self.states.items():
name = details[0]
if name == 'end':
endNode = id
elif name == 'start':
startNode = id
w('digraph %s {' % label)
w(' rankdir=LR;')
w(' size="8,5";')
w(' node [color=white label=""]; Ns;')
w(' node [color=black];')
w(' node [shape=doublecircle, label=""]; N%s [label="%s"];' % (endNode, endNode))
w(' node [shape=circle];')
w(' N%s [label="%s"];' % (startNode, startNode))
w(' Ns -> N%s;' % startNode)
for id, details in self.states.items():
edges = details[1:]
w(' N%s [label="%s"];' % (id, id))
for type, s, e, dest in edges:
c = typeToColor.get(type, 'black')
if type == 'all*':
# special case -- matches any utf8 byte at this point
label = '*'
elif s == e:
label = '%s' % binary(s)
else:
label = '%s-%s' % (binary(s), binary(e))
w(' N%s -> N%s [label="%s" color="%s"];' % (id, dest, label, c))
if name == 'end':
endNode = id
elif name == 'start':
startNode = id
w('}')
return '\n'.join(__l)
def toPNG(self, label, pngOut):
open('tmp.dot', 'wb').write(self.toDOT(label))
if os.system('dot -Tpng tmp.dot -o %s' % pngOut):
raise RuntimeError('dot failed')
MASKS = []
v = 2
for i in range(32):
MASKS.append(v-1)
v *= 2
def binary(x):
if x == 0:
return '00000000'
l = []
while x > 0:
if x & 1 == 1:
l.append('1')
else:
l.append('0')
x = x >> 1
# big endian!
l.reverse()
l2 = []
while len(l) > 0:
s = ''.join(l[-8:])
if len(s) < 8:
s = '0'*(8-len(s)) + s
l2.append(s)
del l[-8:]
return ' '.join(l2)
def getUTF8Rest(code, numBytes):
l = []
for i in range(numBytes):
l.append((128 | (code & MASKS[5]), 6))
code = code >> 6
l.reverse()
return tuple(l)
def toUTF8(code):
# code = Unicode code point
assert code >= 0
assert code <= MAX_UNICODE
if code < 128:
# 0xxxxxxx
bytes = ((code, 7),)
elif code < 2048:
# 110yyyxx 10xxxxxx
byte1 = (6 << 5) | (code >> 6)
bytes = ((byte1, 5),) + getUTF8Rest(code, 1)
elif code < 65536:
# 1110yyyy 10yyyyxx 10xxxxxx
len = 3
byte1 = (14 << 4) | (code >> 12)
bytes = ((byte1, 4),) + getUTF8Rest(code, 2)
else:
# 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
len = 4
byte1 = (30 << 3) | (code >> 18)
bytes = ((byte1, 3),) + getUTF8Rest(code, 3)
return bytes
def all(fsa, startNode, endNode, startCode, endCode, left):
if len(left) == 0:
fsa.addEdge(startNode, endNode, startCode, endCode, 'all')
else:
lastN = fsa.addNode()
fsa.addEdge(startNode, lastN, startCode, endCode, 'all')
while len(left) > 1:
n = fsa.addNode()
fsa.addEdge(lastN, n, 128, 191, 'all*')
left = left[1:]
lastN = n
fsa.addEdge(lastN, endNode, 128, 191, 'all*')
def start(fsa, startNode, endNode, utf8, doAll):
if len(utf8) == 1:
fsa.addEdge(startNode, endNode, utf8[0][0], utf8[0][0] | MASKS[utf8[0][1]-1], 'start')
else:
n = fsa.addNode()
fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'start')
start(fsa, n, endNode, utf8[1:], True)
end = utf8[0][0] | MASKS[utf8[0][1]-1]
if doAll and utf8[0][0] != end:
all(fsa, startNode, endNode, utf8[0][0]+1, end, utf8[1:])
def end(fsa, startNode, endNode, utf8, doAll):
if len(utf8) == 1:
fsa.addEdge(startNode, endNode, utf8[0][0] & ~MASKS[utf8[0][1]-1], utf8[0][0], 'end')
else:
if utf8[0][1] == 5:
# special case -- avoid created unused edges (utf8 doesn't accept certain byte sequences):
start = 194
else:
start = utf8[0][0] & (~MASKS[utf8[0][1]-1])
if doAll and utf8[0][0] != start:
all(fsa, startNode, endNode, start, utf8[0][0]-1, utf8[1:])
n = fsa.addNode()
fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'end')
end(fsa, n, endNode, utf8[1:], True)
def build(fsa,
startNode, endNode,
startUTF8, endUTF8):
# Break into start, middle, end:
if startUTF8[0][0] == endUTF8[0][0]:
# Degen case: lead with the same byte:
if len(startUTF8) == 1 and len(endUTF8) == 1:
fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend')
return
else:
assert len(startUTF8) != 1
assert len(endUTF8) != 1
n = fsa.addNode()
# single value edge
fsa.addEdge(startNode, n, startUTF8[0][0], startUTF8[0][0], 'single')
build(fsa, n, endNode, startUTF8[1:], endUTF8[1:])
elif len(startUTF8) == len(endUTF8):
if len(startUTF8) == 1:
fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend')
else:
start(fsa, startNode, endNode, startUTF8, False)
if endUTF8[0][0] - startUTF8[0][0] > 1:
all(fsa, startNode, endNode, startUTF8[0][0]+1, endUTF8[0][0]-1, startUTF8[1:])
end(fsa, startNode, endNode, endUTF8, False)
else:
# start
start(fsa, startNode, endNode, startUTF8, True)
# possibly middle
byteCount = 1+len(startUTF8)
while byteCount < len(endUTF8):
s = toUTF8(utf8Ranges[byteCount-1][0])
e = toUTF8(utf8Ranges[byteCount-1][1])
all(fsa, startNode, endNode,
s[0][0],
e[0][0],
s[1:])
byteCount += 1
# end
end(fsa, startNode, endNode, endUTF8, True)
def main():
if len(sys.argv) not in (3, 4):
print
print 'Usage: python %s startUTF32 endUTF32 [testCode]' % sys.argv[0]
print
sys.exit(1)
utf32Start = int(sys.argv[1])
utf32End = int(sys.argv[2])
if utf32Start > utf32End:
print 'ERROR: start must be <= end'
sys.exit(1)
fsa = FSA()
fsa.start = fsa.addNode('start')
fsa.end = fsa.addNode('end')
print 's=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32Start)])
print 'e=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32End)])
if len(sys.argv) == 4:
print 't=%s [%s]' % \
(' '.join([binary(x[0]) for x in toUTF8(int(sys.argv[3]))]),
' '.join(['%2x' % x[0] for x in toUTF8(int(sys.argv[3]))]))
build(fsa, fsa.start, fsa.end,
toUTF8(utf32Start),
toUTF8(utf32End))
fsa.toPNG('test', '/tmp/outpy.png')
print 'Saved to /tmp/outpy.png...'
test(fsa, utf32Start, utf32End, 100000);
def test(fsa, utf32Start, utf32End, count):
# verify correct ints are accepted
for i in range(count):
r = random.randint(utf32Start, utf32End)
dest = fsa.run([tup[0] for tup in toUTF8(r)])
if dest != fsa.end:
print 'FAILED: valid %s (%s) is not accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)]))
return False
invalidRange = MAX_UNICODE - (utf32End - utf32Start + 1)
if invalidRange >= 0:
# verify invalid ints are not accepted
for i in range(count):
r = random.randint(0, invalidRange-1)
if r >= utf32Start:
r = utf32End + 1 + r - utf32Start
dest = fsa.run([tup[0] for tup in toUTF8(r)])
if dest != -1:
print 'FAILED: invalid %s (%s) is accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)]))
return False
return True
def stress():
print 'Testing...'
iter = 0
while True:
if iter % 10 == 0:
print '%s...' % iter
iter += 1
v1 = random.randint(0, MAX_UNICODE)
v2 = random.randint(0, MAX_UNICODE)
if v2 < v1:
v1, v2 = v2, v1
utf32Start = v1
utf32End = v2
fsa = FSA()
fsa.start = fsa.addNode('start')
fsa.end = fsa.addNode('end')
build(fsa, fsa.start, fsa.end,
toUTF8(utf32Start),
toUTF8(utf32End))
if not test(fsa, utf32Start, utf32End, 10000):
print 'FAILED on utf32Start=%s utf32End=%s' % (utf32Start, utf32End)
if __name__ == '__main__':
if len(sys.argv) > 1:
main()
else:
stress()
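
Editor's note: UTF32ToUTF8.py built a byte-level FSA accepting exactly the UTF-8 encodings of a UTF-32 code-point range; its toUTF8() helper follows the standard 1-to-4-byte layout shown in its comments (0xxxxxxx, 110yyyxx 10xxxxxx, ...). The Python 3 sketch below restates that byte layout and cross-checks it against the built-in encoder; it is illustrative only and assumes nothing beyond the layout quoted above.

    # Python 3 sketch of the byte layout toUTF8() above produces (1-4 bytes per
    # code point), cross-checked against Python's own UTF-8 encoder.
    def to_utf8(code: int) -> bytes:
        assert 0 <= code <= 0x10FFFF
        if code < 0x80:                       # 0xxxxxxx
            return bytes([code])
        if code < 0x800:                      # 110yyyxx 10xxxxxx
            return bytes([0xC0 | (code >> 6), 0x80 | (code & 0x3F)])
        if code < 0x10000:                    # 1110yyyy 10yyyyxx 10xxxxxx
            return bytes([0xE0 | (code >> 12),
                          0x80 | ((code >> 6) & 0x3F),
                          0x80 | (code & 0x3F)])
        # 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
        return bytes([0xF0 | (code >> 18),
                      0x80 | ((code >> 12) & 0x3F),
                      0x80 | ((code >> 6) & 0x3F),
                      0x80 | (code & 0x3F)])

    for cp in (0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF):
        assert to_utf8(cp) == chr(cp).encode('utf-8'), hex(cp)
    print("byte layout matches chr(cp).encode('utf-8') at the boundary code points")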

fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/createLevAutomata.py (500 deletions)

@@ -1,500 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Note, this file is known to work with rev 120 of the moman
# repository (http://bitbucket.org/jpbarrette/moman/overview)
#
# See also: http://sites.google.com/site/rrettesite/moman
import math
import os
import sys
#sys.path.insert(0, 'moman/finenight/python')
sys.path.insert(0, '../../../../../../../../build/core/moman/finenight/python')
try:
from possibleStates import genTransitions
except ImportError:
from finenight.possibleStates import genTransitions
MODE = 'array'
PACKED = True
WORD = 64
LOG2_WORD = int(math.log(WORD)/math.log(2))
#MODE = 'switch'
class LineOutput:
def __init__(self, indent=''):
self.l = []
self._indent = self.startIndent = indent
self.inComment = False
def __call__(self, s, indent=0):
if s.find('}') != -1:
assert self._indent != self.startIndent
self._indent = self._indent[:-2]
if indent != 0:
indent0 = ' ' * (len(self._indent)/2+indent)
else:
indent0 = self._indent
if s.find('/*') != -1:
if s.find('*/') == -1:
self.inComment = True
elif s.find('*/') != -1:
self.inComment = True
if self.inComment:
self.l.append(indent0 + s)
else:
self.l.append(indent0 + s.lstrip())
self.inComment = self.inComment and s.find('*/') == -1
if s.find('{') != -1:
self._indent += ' '
def __str__(self):
if True:
assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \
(len(self._indent), len(self.startIndent))
return '\n'.join(self.l)
def indent(self):
self._indent += ' '
def outdent(self):
assert self._indent != self.startIndent
self._indent = self._indent[:-2]
def charVarNumber(charVar):
"""
Maps binary number (eg [1, 0, 1]) to its decimal value (5).
"""
p = 1
sum = 0
downTo = len(charVar)-1
while downTo >= 0:
sum += p * int(charVar[downTo])
p *= 2
downTo -= 1
return sum
def main():
if len(sys.argv) != 3:
print
print 'Usage: python -u %s N <True/False>' % sys.argv[0]
print
print 'NOTE: the resulting .java file is created in the current working dir!'
print
sys.exit(1)
n = int(sys.argv[1])
transpose = (sys.argv[2] == "True")
tables = genTransitions(n, transpose)
stateMap = {}
# init null state
stateMap['[]'] = -1
# init start state
stateMap['[(0, 0)]'] = 0
w = LineOutput()
w('package com.fr.third.org.apache.lucene.util.automaton;')
w('')
w('/*')
w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
w(' * contributor license agreements. See the NOTICE file distributed with')
w(' * this work for additional information regarding copyright ownership.')
w(' * The ASF licenses this file to You under the Apache License, Version 2.0')
w(' * (the "License"); you may not use this file except in compliance with')
w(' * the License. You may obtain a copy of the License at')
w(' *')
w(' * http://www.apache.org/licenses/LICENSE-2.0')
w(' *')
w(' * Unless required by applicable law or agreed to in writing, software')
w(' * distributed under the License is distributed on an "AS IS" BASIS,')
w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.')
w(' * See the License for the specific language governing permissions and')
w(' * limitations under the License.')
w(' */')
w('')
w('// The following code was generated with the moman/finenight pkg')
w('// This package is available under the MIT License, see NOTICE.txt')
w('// for more details.')
w('')
w('import com.fr.third.org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
w('')
if transpose:
w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n)
w(' with transpositions as primitive edits */')
className = 'Lev%dTParametricDescription' % n
else:
w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
className = 'Lev%dParametricDescription' % n
w('class %s extends ParametricDescription {' % className)
w('')
w('@Override')
w('int transition(int absState, int position, int vector) {')
w(' // null absState should never be passed in')
w(' assert absState != -1;')
w('')
w(' // decode absState -> state, offset')
w(' int state = absState/(w+1);')
w(' int offset = absState%(w+1);')
w(' assert offset >= 0;')
w('')
machines = []
for i, map in enumerate(tables):
if i == 0:
w('if (position == w) {')
elif i == len(tables)-1:
w('} else {')
else:
w('} else if (position == w-%d) {' % i)
if i != 0 and MODE == 'switch':
w('switch(vector) {')
l = map.items()
l.sort()
numCasesPerVector = None
numVectors = len(l)
if MODE == 'array':
toStateArray = []
toOffsetIncrArray = []
for charVar, states in l:
# somehow it's a string:
charVar = eval(charVar)
if i != 0 and MODE == 'switch':
w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
w.indent()
l = states.items()
byFromState = {}
# first pass to assign states
byAction = {}
for s, (toS, offset) in l:
state = str(s)
toState = str(toS)
if state not in stateMap:
stateMap[state] = len(stateMap)-1
if toState not in stateMap:
stateMap[toState] = len(stateMap)-1
byFromState[stateMap[state]] = (1+stateMap[toState], offset)
fromStateDesc = s[1:len(s)-1]
toStateDesc = ', '.join([str(x) for x in toS])
tup = (stateMap[toState], toStateDesc, offset)
if tup not in byAction:
byAction[tup] = []
byAction[tup].append((fromStateDesc, stateMap[state]))
if numCasesPerVector is None:
numCasesPerVector = len(l)
else:
# we require this to be uniform... empirically it seems to be!
assert numCasesPerVector == len(l)
if MODE == 'array':
for s in range(numCasesPerVector):
toState, offsetIncr = byFromState[s]
toStateArray.append(toState)
toOffsetIncrArray.append(offsetIncr)
else:
# render switches
w('switch(state) { // %s cases' % len(l))
for (toState, toStateDesc, offset), lx in byAction.items():
for fromStateDesc, fromState in lx:
w('case %s: // %s' % (fromState, fromStateDesc))
w.indent()
w(' state = %s; // %s' % (toState, toStateDesc))
if offset > 0:
w(' offset += %s;' % offset)
w('break;')
w.outdent()
w('}')
if i != 0:
w('break;')
w.outdent()
if MODE == 'array':
# strangely state can come in wildly out of bounds....
w(' if (state < %d) {' % numCasesPerVector)
w(' final int loc = vector * %d + state;' % numCasesPerVector)
if PACKED:
w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
else:
w(' offset += offsetIncrs%d[loc];' % i)
w(' state = toStates%d[loc]-1;' % i)
w(' }')
elif i != 0:
w('}')
machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))
# ends switch statement for machine
w('}')
w('')
w(' if (state == -1) {')
w(' // null state')
w(' return -1;')
w(' } else {')
w(' // translate back to abs')
w(' return state*(w+1)+offset;')
w(' }')
# ends transition method
w('}')
subs = []
if MODE == 'array':
w.indent()
for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
w('')
w.outdent()
w('// %d vectors; %d states per vector; array length = %d' % \
(numVectors, numCasesPerVector, numVectors*numCasesPerVector))
w.indent()
if PACKED:
# pack in python
l, nbits = pack(toStateArray)
subs.append(('NBITSSTATES%d' % i, str(nbits)))
w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \
(i, nbits, renderList([hex(long(x)) for x in l])))
l, nbits = pack(toOffsetIncrsArray)
subs.append(('NBITSOFFSET%d' % i, str(nbits)))
w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \
(i, nbits, renderList([hex(long(x)) for x in l])))
else:
w(' private final static int[] toStates%d = new int[] %s;' % \
(i, renderList([str(x) for x in toStateArray])))
w(' private final static int[] offsetIncrs%d = new int[] %s;' % \
(i, renderList([str(x) for x in toStateArray])))
w.outdent()
stateMap2 = dict([[v,k] for k,v in stateMap.items()])
w('')
w('// state map')
sum = 0
minErrors = []
for i in xrange(len(stateMap2)-1):
w('// %s -> %s' % (i, stateMap2[i]))
# we replace t-notation as its not relevant here
st = stateMap2[i].replace('t', '')
v = eval(st)
minError = min([-i+e for i, e in v])
c = len(v)
sum += c
minErrors.append(minError)
w('')
w.indent()
#w('private final static int[] minErrors = new int[] {%s};' % ','.join([str(x) for x in minErrors]))
w.outdent()
w('')
w(' public %s(int w) {' % className)
w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
w(' }')
if 0:
w('')
w('@Override')
w('public int size() { // this can now move up?')
w(' return %d*(w+1);' % (len(stateMap2)-1))
w('}')
w('')
w('@Override')
w('public int getPosition(int absState) { // this can now move up?')
w(' return absState % (w+1);')
w('}')
w('')
w('@Override')
w('public boolean isAccept(int absState) { // this can now move up?')
w(' // decode absState -> state, offset')
w(' int state = absState/(w+1);')
w(' if (true || state < minErrors.length) {')
w(' int offset = absState%(w+1);')
w(' assert offset >= 0;')
w(' return w - offset + minErrors[state] <= %d;' % n)
w(' } else {')
w(' return false;')
w(' }')
w('}')
if MODE == 'array' and PACKED:
# we moved into super class
if False:
w('')
v = 2
l = []
for i in range(63):
l.append(hex(v-1))
v *= 2
w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1)
w('')
# unpack in java
w('private int unpack(long[] data, int index, int bitsPerValue) {')
w(' final long bitLoc = bitsPerValue * index;')
w(' final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD)
w(' final int bitStart = (int) (bitLoc & %d);' % (WORD-1))
w(' //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);')
w(' if (bitStart + bitsPerValue <= %d) {' % WORD)
w(' // not split')
w(' return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);')
w(' } else {')
w(' // split')
w(' final int part = %d-bitStart;' % WORD)
w(' return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +')
w(' ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1)
w(' }')
w('}')
# class
w('}')
w('')
fileOut = '%s.java' % className
s = str(w)
for sub, repl in subs:
s = s.replace(sub, repl)
open(fileOut, 'wb').write(s)
print 'Wrote %s [%d lines; %.1f KB]' % \
(fileOut, len(w.l), os.path.getsize(fileOut)/1024.)
def renderList(l):
lx = [' ']
for i in xrange(len(l)):
if i > 0:
lx.append(',')
if i % 4 == 0:
lx.append('\n ')
lx.append(l[i])
return '{\n%s\n }' % ''.join(lx)
MASKS = []
v = 2
for i in xrange(63):
MASKS.append(v-1)
v *= 2
# packs into longs; returns long[], numBits
def pack(l):
maxV = max(l)
bitsPerValue = max(1, int(math.ceil(math.log(maxV+1)/math.log(2.0))))
bitsLeft = WORD
pendingValue = 0
packed = []
for i in xrange(len(l)):
v = l[i]
if pendingValue > 0:
bitsUsed = math.ceil(math.log(pendingValue)/math.log(2.0))
assert bitsUsed <= (WORD-bitsLeft), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (bitsLeft, WORD, bitsLeft, WORD-bitsLeft, bitsUsed)
if bitsLeft >= bitsPerValue:
pendingValue += v << (WORD-bitsLeft)
bitsLeft -= bitsPerValue
if bitsLeft == 0:
packed.append(pendingValue)
bitsLeft = WORD
pendingValue = 0
else:
# split
# bottom bitsLeft go in current word:
pendingValue += (v & MASKS[bitsLeft-1]) << (WORD-bitsLeft)
packed.append(pendingValue)
pendingValue = v >> bitsLeft
bitsLeft = WORD - (bitsPerValue-bitsLeft)
if bitsLeft < WORD:
packed.append(pendingValue)
# verify(l, packed, bitsPerValue)
return packed, bitsPerValue
def verify(data, packedData, bitsPerValue):
for i in range(len(data)):
assert data[i] == unpack(packedData, i, bitsPerValue)
def unpack(data, index, bitsPerValue):
bitLoc = bitsPerValue * index
dataLoc = int(bitLoc >> LOG2_WORD)
bitStart = int(bitLoc & (WORD-1))
if bitStart + bitsPerValue <= WORD:
# not split
return int(((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]))
else:
# split
part = WORD-bitStart;
return int((((data[dataLoc] >> bitStart) & MASKS[part-1]) +
((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part)))
if __name__ == '__main__':
if not __debug__:
print
print 'ERROR: please run without -O'
print
sys.exit(1)
main()
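
Editor's note: createLevAutomata.py emits the LevNParametricDescription Java sources and, when PACKED is on, stores each transition table in a long[] with a fixed number of bits per entry via its pack()/unpack() helpers. Below is a hedged Python 3 round-trip sketch of that packing scheme (values laid out little-endian inside 64-bit words, possibly straddling a word boundary); the sample table is hypothetical.

    # Python 3 sketch of the bit-packing scheme used by pack()/unpack() above:
    # bitsPerValue bits per entry, little-endian within 64-bit words, entries
    # may straddle two words. Round-trips a small hypothetical table.
    import math

    WORD = 64

    def pack(values):
        bits = max(1, math.ceil(math.log2(max(values) + 1)))
        words, cur, used = [], 0, 0
        for v in values:
            cur |= v << used
            used += bits
            if used >= WORD:
                words.append(cur & (2**WORD - 1))
                cur = v >> (bits - (used - WORD))   # overflow bits carry over
                used -= WORD
        if used:
            words.append(cur)
        return words, bits

    def unpack(words, index, bits):
        bit_loc = bits * index
        word, start = bit_loc // WORD, bit_loc % WORD
        out = (words[word] >> start) & ((1 << bits) - 1)
        if start + bits > WORD:                      # value split across two words
            out |= (words[word + 1] << (WORD - start)) & ((1 << bits) - 1)
        return out

    table = [0, 5, 3, 7, 2, 6, 1, 4] * 11            # hypothetical transition table
    packed, nbits = pack(table)
    assert all(unpack(packed, i, nbits) == v for i, v in enumerate(table))
    print('packed %d values at %d bits each into %d longs' % (len(table), nbits, len(packed)))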

fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_BulkOperation.py (335 deletions)

@@ -1,335 +0,0 @@
#! /usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from fractions import gcd
"""Code generation for bulk operations"""
MAX_SPECIALIZED_BITS_PER_VALUE = 24;
PACKED_64_SINGLE_BLOCK_BPV = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32]
OUTPUT_FILE = "BulkOperation.java"
HEADER = """// This file has been automatically generated, DO NOT EDIT
package com.fr.third.org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
FOOTER="""
protected int writeLong(long block, byte[] blocks, int blocksOffset) {
for (int j = 1; j <= 8; ++j) {
blocks[blocksOffset++] = (byte) (block >>> (64 - (j << 3)));
}
return blocksOffset;
}
/**
* For every number of bits per value, there is a minimum number of
* blocks (b) / values (v) you need to write in order to reach the next block
* boundary:
* - 16 bits per value -> b=1, v=4
* - 24 bits per value -> b=3, v=8
* - 50 bits per value -> b=25, v=32
* - 63 bits per value -> b=63, v=64
* - ...
*
* A bulk read consists in copying <code>iterations*v</code> values that are
* contained in <code>iterations*b</code> blocks into a <code>long[]</code>
* (higher values of <code>iterations</code> are likely to yield a better
* throughput) => this requires n * (b + v) longs in memory.
*
* This method computes <code>iterations</code> as
* <code>ramBudget / (8 * (b + v))</code> (since a long is 8 bytes).
*/
public final int computeIterations(int valueCount, int ramBudget) {
final int iterations = (ramBudget >>> 3) / (blockCount() + valueCount());
if (iterations == 0) {
// at least 1
return 1;
} else if ((iterations - 1) * blockCount() >= valueCount) {
// don't allocate for more than the size of the reader
return (int) Math.ceil((double) valueCount / valueCount());
} else {
return iterations;
}
}
}
"""
def is_power_of_two(n):
return n & (n - 1) == 0
def casts(typ):
cast_start = "(%s) (" %typ
cast_end = ")"
if typ == "long":
cast_start = ""
cast_end = ""
return cast_start, cast_end
def hexNoLSuffix(n):
# On 32 bit Python values > (1 << 31)-1 will have L appended by hex function:
s = hex(n)
if s.endswith('L'):
s = s[:-1]
return s
def masks(bits):
if bits == 64:
return "", ""
return "(", " & %sL)" %(hexNoLSuffix((1 << bits) - 1))
def get_type(bits):
if bits == 8:
return "byte"
elif bits == 16:
return "short"
elif bits == 32:
return "int"
elif bits == 64:
return "long"
else:
assert False
def block_value_count(bpv, bits=64):
blocks = bpv
values = blocks * bits / bpv
while blocks % 2 == 0 and values % 2 == 0:
blocks /= 2
values /= 2
assert values * bpv == bits * blocks, "%d values, %d blocks, %d bits per value" %(values, blocks, bpv)
return (blocks, values)
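
Editor's note: block_value_count() above reduces the ratio bpv : 64 (or bpv : 8 for byte blocks) by stripping common factors of two, which is where the b/v pairs quoted in the FOOTER javadoc come from. A small Python 3 check under that reading:

    # Small Python 3 check of the blocks/values ratio computed by
    # block_value_count() above (integer division made explicit for Python 3).
    # It reproduces the b/v pairs quoted in the FOOTER javadoc.
    def block_value_count(bpv, bits=64):
        blocks, values = bpv, bits          # blocks*bits == values*bpv by construction
        while blocks % 2 == 0 and values % 2 == 0:
            blocks //= 2
            values //= 2
        return blocks, values

    assert block_value_count(16) == (1, 4)    # 16 bits per value -> b=1, v=4
    assert block_value_count(24) == (3, 8)    # 24 bits per value -> b=3, v=8
    assert block_value_count(50) == (25, 32)  # 50 bits per value -> b=25, v=32
    assert block_value_count(63) == (63, 64)  # 63 bits per value -> b=63, v=64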
def packed64(bpv, f):
blocks, values = block_value_count(bpv)
mask = (1 << bpv) - 1
f.write("\n")
f.write(" public BulkOperationPacked%d() {\n" %bpv)
f.write(" super(%d);\n" %bpv)
f.write(" assert blockCount() == %d;\n" %blocks)
f.write(" assert valueCount() == %d;\n" %values)
f.write(" }\n\n")
if bpv == 64:
f.write(""" @Override
public void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
System.arraycopy(blocks, blocksOffset, values, valuesOffset, valueCount() * iterations);
}
@Override
public void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
throw new UnsupportedOperationException();
}
@Override
public void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) {
throw new UnsupportedOperationException();
}
@Override
public void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) {
LongBuffer.wrap(values, valuesOffset, iterations * valueCount()).put(ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer());
}
""")
else:
p64_decode(bpv, f, 32)
p64_decode(bpv, f, 64)
def p64_decode(bpv, f, bits):
blocks, values = block_value_count(bpv)
typ = get_type(bits)
cast_start, cast_end = casts(typ)
f.write(" @Override\n")
f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
else:
f.write(" for (int i = 0; i < iterations; ++i) {\n")
mask = (1 << bpv) - 1
if is_power_of_two(bpv):
f.write(" final long block = blocks[blocksOffset++];\n")
f.write(" for (int shift = %d; shift >= 0; shift -= %d) {\n" %(64 - bpv, bpv))
f.write(" values[valuesOffset++] = %s(block >>> shift) & %d%s;\n" %(cast_start, mask, cast_end))
f.write(" }\n")
else:
for i in xrange(0, values):
block_offset = i * bpv / 64
bit_offset = (i * bpv) % 64
if bit_offset == 0:
# start of block
f.write(" final long block%d = blocks[blocksOffset++];\n" %block_offset);
f.write(" values[valuesOffset++] = %sblock%d >>> %d%s;\n" %(cast_start, block_offset, 64 - bpv, cast_end))
elif bit_offset + bpv == 64:
# end of block
f.write(" values[valuesOffset++] = %sblock%d & %dL%s;\n" %(cast_start, block_offset, mask, cast_end))
elif bit_offset + bpv < 64:
# middle of block
f.write(" values[valuesOffset++] = %s(block%d >>> %d) & %dL%s;\n" %(cast_start, block_offset, 64 - bit_offset - bpv, mask, cast_end))
else:
# value spans across 2 blocks
mask1 = (1 << (64 - bit_offset)) -1
shift1 = bit_offset + bpv - 64
shift2 = 64 - shift1
f.write(" final long block%d = blocks[blocksOffset++];\n" %(block_offset + 1));
f.write(" values[valuesOffset++] = %s((block%d & %dL) << %d) | (block%d >>> %d)%s;\n" %(cast_start, block_offset, mask1, shift1, block_offset + 1, shift2, cast_end))
f.write(" }\n")
f.write(" }\n\n")
byte_blocks, byte_values = block_value_count(bpv, 8)
f.write(" @Override\n")
f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ)
if bits < bpv:
f.write(" throw new UnsupportedOperationException();\n")
else:
if is_power_of_two(bpv) and bpv < 8:
f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n")
f.write(" final byte block = blocks[blocksOffset++];\n")
for shift in xrange(8 - bpv, 0, -bpv):
f.write(" values[valuesOffset++] = (block >>> %d) & %d;\n" %(shift, mask))
f.write(" values[valuesOffset++] = block & %d;\n" %mask)
f.write(" }\n")
elif bpv == 8:
f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n")
f.write(" values[valuesOffset++] = blocks[blocksOffset++] & 0xFF;\n")
f.write(" }\n")
elif is_power_of_two(bpv) and bpv > 8:
f.write(" for (int j = 0; j < %d * iterations; ++j) {\n" %(64 / bpv))
m = bits <= 32 and "0xFF" or "0xFFL"
f.write(" values[valuesOffset++] =")
for i in xrange(bpv / 8 - 1):
f.write(" ((blocks[blocksOffset++] & %s) << %d) |" %(m, bpv - 8))
f.write(" (blocks[blocksOffset++] & %s);\n" %m)
f.write(" }\n")
else:
f.write(" for (int i = 0; i < 8 * iterations; ++i) {\n")
for i in xrange(0, byte_values):
byte_start = i * bpv / 8
bit_start = (i * bpv) % 8
byte_end = ((i + 1) * bpv - 1) / 8
bit_end = ((i + 1) * bpv - 1) % 8
shift = lambda b: 8 * (byte_end - b - 1) + 1 + bit_end
if bit_start == 0:
f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, byte_start))
for b in xrange(byte_start + 1, byte_end + 1):
f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, b))
f.write(" values[valuesOffset++] =")
if byte_start == byte_end:
if bit_start == 0:
if bit_end == 7:
f.write(" byte%d" %byte_start)
else:
f.write(" byte%d >>> %d" %(byte_start, 7 - bit_end))
else:
if bit_end == 7:
f.write(" byte%d & %d" %(byte_start, 2 ** (8 - bit_start) - 1))
else:
f.write(" (byte%d >>> %d) & %d" %(byte_start, 7 - bit_end, 2 ** (bit_end - bit_start + 1) - 1))
else:
if bit_start == 0:
f.write(" (byte%d << %d)" %(byte_start, shift(byte_start)))
else:
f.write(" ((byte%d & %d) << %d)" %(byte_start, 2 ** (8 - bit_start) - 1, shift(byte_start)))
for b in xrange(byte_start + 1, byte_end):
f.write(" | (byte%d << %d)" %(b, shift(b)))
if bit_end == 7:
f.write(" | byte%d" %byte_end)
else:
f.write(" | (byte%d >>> %d)" %(byte_end, 7 - bit_end))
f.write(";\n")
f.write(" }\n")
f.write(" }\n\n")
if __name__ == '__main__':
f = open(OUTPUT_FILE, 'w')
f.write(HEADER)
f.write('\n')
f.write('''/**
* Efficient sequential read/write of packed integers.
*/\n''')
f.write('abstract class BulkOperation implements PackedInts.Decoder, PackedInts.Encoder {\n')
f.write(' private static final BulkOperation[] packedBulkOps = new BulkOperation[] {\n')
for bpv in xrange(1, 65):
if bpv > MAX_SPECIALIZED_BITS_PER_VALUE:
f.write(' new BulkOperationPacked(%d),\n' % bpv)
continue
f2 = open('BulkOperationPacked%d.java' % bpv, 'w')
f2.write(HEADER)
if bpv == 64:
f2.write('import java.nio.LongBuffer;\n')
f2.write('import java.nio.ByteBuffer;\n')
f2.write('\n')
f2.write('''/**
* Efficient sequential read/write of packed integers.
*/\n''')
f2.write('final class BulkOperationPacked%d extends BulkOperationPacked {\n' % bpv)
packed64(bpv, f2)
f2.write('}\n')
f2.close()
f.write(' new BulkOperationPacked%d(),\n' % bpv)
f.write(' };\n')
f.write('\n')
f.write(' // NOTE: this is sparse (some entries are null):\n')
f.write(' private static final BulkOperation[] packedSingleBlockBulkOps = new BulkOperation[] {\n')
for bpv in xrange(1, max(PACKED_64_SINGLE_BLOCK_BPV)+1):
if bpv in PACKED_64_SINGLE_BLOCK_BPV:
f.write(' new BulkOperationPackedSingleBlock(%d),\n' % bpv)
else:
f.write(' null,\n')
f.write(' };\n')
f.write('\n')
f.write("\n")
f.write(" public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {\n")
f.write(" switch (format) {\n")
f.write(" case PACKED:\n")
f.write(" assert packedBulkOps[bitsPerValue - 1] != null;\n")
f.write(" return packedBulkOps[bitsPerValue - 1];\n")
f.write(" case PACKED_SINGLE_BLOCK:\n")
f.write(" assert packedSingleBlockBulkOps[bitsPerValue - 1] != null;\n")
f.write(" return packedSingleBlockBulkOps[bitsPerValue - 1];\n")
f.write(" default:\n")
f.write(" throw new AssertionError();\n")
f.write(" }\n")
f.write(" }\n")
f.write(FOOTER)
f.close()

175
fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Direct.py

@ -1,175 +0,0 @@
#! /usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
HEADER="""// This file has been automatically generated, DO NOT EDIT
package com.fr.third.org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.store.DataInput;
import com.fr.third.org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
import java.util.Arrays;
"""
TYPES = {8: "byte", 16: "short", 32: "int", 64: "long"}
MASKS = {8: " & 0xFFL", 16: " & 0xFFFFL", 32: " & 0xFFFFFFFFL", 64: ""}
CASTS = {8: "(byte) ", 16: "(short) ", 32: "(int) ", 64: ""}
if __name__ == '__main__':
for bpv in TYPES.keys():
f = open("Direct%d.java" %bpv, 'w')
f.write(HEADER)
f.write("""/**
* Direct wrapping of %d-bits values to a backing array.
* @lucene.internal
*/\n""" %bpv)
f.write("final class Direct%d extends PackedInts.MutableImpl {\n" %bpv)
f.write(" final %s[] values;\n\n" %TYPES[bpv])
f.write(" Direct%d(int valueCount) {\n" %bpv)
f.write(" super(valueCount, %d);\n" %bpv)
f.write(" values = new %s[valueCount];\n" %TYPES[bpv])
f.write(" }\n\n")
f.write(" Direct%d(DataInput in, int valueCount) throws IOException {\n" %bpv)
f.write(" this(valueCount);\n")
f.write(" for (int i = 0; i < valueCount; ++i) {\n")
f.write(" values[i] = in.read%s();\n" %TYPES[bpv].title())
f.write(" }\n")
if bpv != 64:
f.write(" final int mod = valueCount %% %d;\n" %(64 / bpv))
f.write(" if (mod != 0) {\n")
f.write(" for (int i = mod; i < %d; ++i) {\n" %(64 / bpv))
f.write(" in.read%s();\n" %TYPES[bpv].title())
f.write(" }\n")
f.write(" }\n")
f.write(" }\n")
f.write("""
@Override
public long get(final int index) {
return values[index]%s;
}
public void set(final int index, final long value) {
values[index] = %s(value);
}
public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(values);
}
public void clear() {
Arrays.fill(values, %s0L);
}
@Override
public Object getArray() {
return values;
}
@Override
public boolean hasArray() {
return true;
}
""" %(MASKS[bpv], CASTS[bpv], CASTS[bpv]))
if bpv == 64:
f.write("""
@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;
final int gets = Math.min(valueCount - index, len);
System.arraycopy(values, index, arr, off, gets);
return gets;
}
public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;
final int sets = Math.min(valueCount - index, len);
System.arraycopy(arr, off, values, index, sets);
return sets;
}
@Override
public void fill(int fromIndex, int toIndex, long val) {
Arrays.fill(values, fromIndex, toIndex, val);
}
""")
else:
f.write("""
@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;
final int gets = Math.min(valueCount - index, len);
for (int i = index, o = off, end = index + gets; i < end; ++i, ++o) {
arr[o] = values[i]%s;
}
return gets;
}
public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;
final int sets = Math.min(valueCount - index, len);
for (int i = index, o = off, end = index + sets; i < end; ++i, ++o) {
values[i] = %sarr[o];
}
return sets;
}
@Override
public void fill(int fromIndex, int toIndex, long val) {
assert val == (val%s);
Arrays.fill(values, fromIndex, toIndex, %sval);
}
""" %(MASKS[bpv], CASTS[bpv], MASKS[bpv], CASTS[bpv]))
f.write("}\n")
f.close()
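# Illustration, not part of the original script: the MASKS entries exist
# because the backing Java arrays are signed. Without the "& 0xFFL" applied in
# get(), a stored byte of 0xFF would sign-extend and read back as -1 instead
# of the unsigned value 255.
_stored = (0xFF ^ 0x80) - 0x80        # sign-extend 0xFF to a signed byte: -1
assert _stored == -1
assert _stored & 0xFF == 255          # the mask recovers the unsigned value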

291
fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Packed64SingleBlock.py

@ -1,291 +0,0 @@
#! /usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SUPPORTED_BITS_PER_VALUE = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32]
HEADER="""// This file has been automatically generated, DO NOT EDIT
package com.fr.third.org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
import java.io.IOException;
import java.util.Arrays;
import com.fr.third.org.apache.lucene.store.DataInput;
import com.fr.third.org.apache.lucene.util.RamUsageEstimator;
/**
* This class is similar to {@link Packed64} except that it trades space for
* speed by ensuring that a single block needs to be read/written in order to
* read/write a value.
*/
abstract class Packed64SingleBlock extends PackedInts.MutableImpl {
public static final int MAX_SUPPORTED_BITS_PER_VALUE = %d;
private static final int[] SUPPORTED_BITS_PER_VALUE = new int[] {%s};
public static boolean isSupported(int bitsPerValue) {
return Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) >= 0;
}
private static int requiredCapacity(int valueCount, int valuesPerBlock) {
return valueCount / valuesPerBlock
+ (valueCount %% valuesPerBlock == 0 ? 0 : 1);
}
final long[] blocks;
Packed64SingleBlock(int valueCount, int bitsPerValue) {
super(valueCount, bitsPerValue);
assert isSupported(bitsPerValue);
final int valuesPerBlock = 64 / bitsPerValue;
blocks = new long[requiredCapacity(valueCount, valuesPerBlock)];
}
@Override
public void clear() {
Arrays.fill(blocks, 0L);
}
public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(blocks);
}
@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
len = Math.min(len, valueCount - index);
assert off + len <= arr.length;
final int originalIndex = index;
// go to the next block boundary
final int valuesPerBlock = 64 / bitsPerValue;
final int offsetInBlock = index %% valuesPerBlock;
if (offsetInBlock != 0) {
for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) {
arr[off++] = get(index++);
--len;
}
if (len == 0) {
return index - originalIndex;
}
}
// bulk get
assert index %% valuesPerBlock == 0;
final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
assert decoder.blockCount() == 1;
assert decoder.valueCount() == valuesPerBlock;
final int blockIndex = index / valuesPerBlock;
final int nblocks = (index + len) / valuesPerBlock - blockIndex;
decoder.decode(blocks, blockIndex, arr, off, nblocks);
final int diff = nblocks * valuesPerBlock;
index += diff; len -= diff;
if (index > originalIndex) {
// stay at the block boundary
return index - originalIndex;
} else {
// no progress so far => already at a block boundary but no full block to
// get
assert index == originalIndex;
return super.get(index, arr, off, len);
}
}
@Override
public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
len = Math.min(len, valueCount - index);
assert off + len <= arr.length;
final int originalIndex = index;
// go to the next block boundary
final int valuesPerBlock = 64 / bitsPerValue;
final int offsetInBlock = index %% valuesPerBlock;
if (offsetInBlock != 0) {
for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) {
set(index++, arr[off++]);
--len;
}
if (len == 0) {
return index - originalIndex;
}
}
// bulk set
assert index %% valuesPerBlock == 0;
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
assert op.blockCount() == 1;
assert op.valueCount() == valuesPerBlock;
final int blockIndex = index / valuesPerBlock;
final int nblocks = (index + len) / valuesPerBlock - blockIndex;
op.encode(arr, off, blocks, blockIndex, nblocks);
final int diff = nblocks * valuesPerBlock;
index += diff; len -= diff;
if (index > originalIndex) {
// stay at the block boundary
return index - originalIndex;
} else {
// no progress so far => already at a block boundary but no full block to
// set
assert index == originalIndex;
return super.set(index, arr, off, len);
}
}
@Override
public void fill(int fromIndex, int toIndex, long val) {
assert fromIndex >= 0;
assert fromIndex <= toIndex;
assert PackedInts.bitsRequired(val) <= bitsPerValue;
final int valuesPerBlock = 64 / bitsPerValue;
if (toIndex - fromIndex <= valuesPerBlock << 1) {
// there needs to be at least one full block to set for the block
// approach to be worth trying
super.fill(fromIndex, toIndex, val);
return;
}
// set values naively until the next block start
int fromOffsetInBlock = fromIndex %% valuesPerBlock;
if (fromOffsetInBlock != 0) {
for (int i = fromOffsetInBlock; i < valuesPerBlock; ++i) {
set(fromIndex++, val);
}
assert fromIndex %% valuesPerBlock == 0;
}
// bulk set of the inner blocks
final int fromBlock = fromIndex / valuesPerBlock;
final int toBlock = toIndex / valuesPerBlock;
assert fromBlock * valuesPerBlock == fromIndex;
long blockValue = 0L;
for (int i = 0; i < valuesPerBlock; ++i) {
blockValue = blockValue | (val << (i * bitsPerValue));
}
Arrays.fill(blocks, fromBlock, toBlock, blockValue);
// fill the gap
for (int i = valuesPerBlock * toBlock; i < toIndex; ++i) {
set(i, val);
}
}
@Override
protected PackedInts.Format getFormat() {
return PackedInts.Format.PACKED_SINGLE_BLOCK;
}
@Override
public String toString() {
return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue
+ ", size=" + size() + ", elements.length=" + blocks.length + ")";
}
public static Packed64SingleBlock create(DataInput in,
int valueCount, int bitsPerValue) throws IOException {
Packed64SingleBlock reader = create(valueCount, bitsPerValue);
for (int i = 0; i < reader.blocks.length; ++i) {
reader.blocks[i] = in.readLong();
}
return reader;
}
""" %(SUPPORTED_BITS_PER_VALUE[-1], ", ".join(map(str, SUPPORTED_BITS_PER_VALUE)))
FOOTER = "}"
if __name__ == '__main__':
f = open("Packed64SingleBlock.java", 'w')
f.write(HEADER)
f.write(" public static Packed64SingleBlock create(int valueCount, int bitsPerValue) {\n")
f.write(" switch (bitsPerValue) {\n")
for bpv in SUPPORTED_BITS_PER_VALUE:
f.write(" case %d:\n" %bpv)
f.write(" return new Packed64SingleBlock%d(valueCount);\n" %bpv)
f.write(" default:\n")
f.write(" throw new IllegalArgumentException(\"Unsupported number of bits per value: \" + %d);\n" %bpv)
f.write(" }\n")
f.write(" }\n\n")
for bpv in SUPPORTED_BITS_PER_VALUE:
log_2 = 0
while (1 << log_2) < bpv:
log_2 = log_2 + 1
if (1 << log_2) != bpv:
log_2 = None
f.write(" static class Packed64SingleBlock%d extends Packed64SingleBlock {\n\n" %bpv)
f.write(" Packed64SingleBlock%d(int valueCount) {\n" %bpv)
f.write(" super(valueCount, %d);\n" %bpv)
f.write(" }\n\n")
f.write(" @Override\n")
f.write(" public long get(int index) {\n")
if log_2 is not None:
f.write(" final int o = index >>> %d;\n" %(6 - log_2))
f.write(" final int b = index & %d;\n" %((1 << (6 - log_2)) - 1))
f.write(" final int shift = b << %d;\n" %log_2)
else:
f.write(" final int o = index / %d;\n" %(64 / bpv))
f.write(" final int b = index %% %d;\n" %(64 / bpv))
f.write(" final int shift = b * %d;\n" %bpv)
f.write(" return (blocks[o] >>> shift) & %dL;\n" %((1 << bpv) - 1))
f.write(" }\n\n")
f.write(" @Override\n")
f.write(" public void set(int index, long value) {\n")
if log_2 is not None:
f.write(" final int o = index >>> %d;\n" %(6 - log_2))
f.write(" final int b = index & %d;\n" %((1 << (6 - log_2)) - 1))
f.write(" final int shift = b << %d;\n" %log_2)
else:
f.write(" final int o = index / %d;\n" %(64 / bpv))
f.write(" final int b = index %% %d;\n" %(64 / bpv))
f.write(" final int shift = b * %d;\n" %bpv)
f.write(" blocks[o] = (blocks[o] & ~(%dL << shift)) | (value << shift);\n" % ((1 << bpv) - 1))
f.write(" }\n\n")
f.write(" }\n\n")
f.write(FOOTER)
f.close()
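# Illustration, not part of the original script (helper names are
# illustrative): for power-of-two bpv the generated get()/set() replace the
# division/modulo addressing with shifts and masks. Both forms locate the same
# block index and in-block bit shift.
def _locate_div(index, bpv):
    values_per_block = 64 // bpv
    return index // values_per_block, (index % values_per_block) * bpv

def _locate_shift(index, log_2):      # only valid when bpv == 1 << log_2
    o = index >> (6 - log_2)
    b = index & ((1 << (6 - log_2)) - 1)
    return o, b << log_2

assert all(_locate_div(i, 4) == _locate_shift(i, 2) for i in range(256))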

161
fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_PackedThreeBlocks.py

@ -1,161 +0,0 @@
#! /usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
HEADER="""// This file has been automatically generated, DO NOT EDIT
package com.fr.third.org.apache.lucene.util.packed;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.store.DataInput;
import com.fr.third.org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
import java.util.Arrays;
"""
TYPES = {8: "byte", 16: "short"}
MASKS = {8: " & 0xFFL", 16: " & 0xFFFFL", 32: " & 0xFFFFFFFFL", 64: ""}
CASTS = {8: "(byte) ", 16: "(short) ", 32: "(int) ", 64: ""}
if __name__ == '__main__':
for bpv in TYPES.keys():
f = open("Packed%dThreeBlocks.java" %bpv, 'w')
f.write(HEADER)
f.write("""/**
* Packs integers into 3 %ss (%d bits per value).
* @lucene.internal
*/\n""" %(TYPES[bpv], bpv*3))
f.write("final class Packed%dThreeBlocks extends PackedInts.MutableImpl {\n" %bpv)
f.write(" final %s[] blocks;\n\n" %TYPES[bpv])
f.write(" public static final int MAX_SIZE = Integer.MAX_VALUE / 3;\n\n")
f.write(" Packed%dThreeBlocks(int valueCount) {\n" %bpv)
f.write(" super(valueCount, %d);\n" %(bpv*3))
f.write(" if (valueCount > MAX_SIZE) {\n")
f.write(" throw new ArrayIndexOutOfBoundsException(\"MAX_SIZE exceeded\");\n")
f.write(" }\n")
f.write(" blocks = new %s[valueCount * 3];\n" %TYPES[bpv])
f.write(" }\n\n")
f.write(" Packed%dThreeBlocks(DataInput in, int valueCount) throws IOException {\n" %bpv)
f.write(" this(valueCount);\n")
f.write(" for (int i = 0; i < 3 * valueCount; ++i) {\n")
f.write(" blocks[i] = in.read%s();\n" %TYPES[bpv].title())
f.write(" }\n")
f.write(" final int mod = blocks.length %% %d;\n" %(64 / bpv))
f.write(" if (mod != 0) {\n")
f.write(" for (int i = mod; i < %d; ++i) {\n" %(64 / bpv))
f.write(" in.read%s();\n" %TYPES[bpv].title())
f.write(" }\n")
f.write(" }\n")
f.write(" }\n")
f.write("""
@Override
public long get(int index) {
final int o = index * 3;
return (blocks[o]%s) << %d | (blocks[o+1]%s) << %d | (blocks[o+2]%s);
}
@Override
public int get(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;
final int gets = Math.min(valueCount - index, len);
for (int i = index * 3, end = (index + gets) * 3; i < end; i+=3) {
arr[off++] = (blocks[i]%s) << %d | (blocks[i+1]%s) << %d | (blocks[i+2]%s);
}
return gets;
}
@Override
public void set(int index, long value) {
final int o = index * 3;
blocks[o] = %s(value >>> %d);
blocks[o+1] = %s(value >>> %d);
blocks[o+2] = %svalue;
}
@Override
public int set(int index, long[] arr, int off, int len) {
assert len > 0 : "len must be > 0 (got " + len + ")";
assert index >= 0 && index < valueCount;
assert off + len <= arr.length;
final int sets = Math.min(valueCount - index, len);
for (int i = off, o = index * 3, end = off + sets; i < end; ++i) {
final long value = arr[i];
blocks[o++] = %s(value >>> %d);
blocks[o++] = %s(value >>> %d);
blocks[o++] = %svalue;
}
return sets;
}
@Override
public void fill(int fromIndex, int toIndex, long val) {
final %s block1 = %s(val >>> %d);
final %s block2 = %s(val >>> %d);
final %s block3 = %sval;
for (int i = fromIndex * 3, end = toIndex * 3; i < end; i += 3) {
blocks[i] = block1;
blocks[i+1] = block2;
blocks[i+2] = block3;
}
}
@Override
public void clear() {
Arrays.fill(blocks, %s0);
}
public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(blocks);
}
@Override
public String toString() {
return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue
+ ", size=" + size() + ", elements.length=" + blocks.length + ")";
}
}
""" %(MASKS[bpv], 2*bpv, MASKS[bpv], bpv, MASKS[bpv], MASKS[bpv], 2*bpv, MASKS[bpv], bpv, MASKS[bpv], CASTS[bpv], 2*bpv, CASTS[bpv], bpv, CASTS[bpv], CASTS[bpv],
2*bpv, CASTS[bpv], bpv, CASTS[bpv], TYPES[bpv], CASTS[bpv], 2*bpv, TYPES[bpv],
CASTS[bpv], bpv, TYPES[bpv], CASTS[bpv], CASTS[bpv]))
f.close()
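# Illustration, not part of the original script (helper names are
# illustrative): Packed8ThreeBlocks stores each 24-bit value as three
# consecutive bytes, most significant first, and get() reassembles them with
# the shifts emitted above (2*bpv, bpv, 0).
def _split3(value, bpv=8):
    mask = (1 << bpv) - 1
    return (value >> (2 * bpv)) & mask, (value >> bpv) & mask, value & mask

def _join3(b0, b1, b2, bpv=8):
    return (b0 << (2 * bpv)) | (b1 << bpv) | b2

assert _split3(0xABCDEF) == (0xAB, 0xCD, 0xEF)
assert _join3(0xAB, 0xCD, 0xEF) == 0xABCDEF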