From 9156f4439f40aca12096839232ecd2e6861e0eb2 Mon Sep 17 00:00:00 2001 From: "Yuan.Wang" Date: Tue, 28 Nov 2023 20:27:25 +0800 Subject: [PATCH] =?UTF-8?q?REPORT-110010=20=E5=88=A0=E9=99=A4lucene?= =?UTF-8?q?=E9=87=8C=E7=9A=84py=E8=85=B3=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../lucene/analysis/charfilter/htmlentity.py | 539 ------------------ .../lucene/util/automaton/UTF32ToUTF8.py | 366 ------------ .../util/automaton/createLevAutomata.py | 500 ---------------- .../lucene/util/packed/gen_BulkOperation.py | 335 ----------- .../apache/lucene/util/packed/gen_Direct.py | 175 ------ .../util/packed/gen_Packed64SingleBlock.py | 291 ---------- .../util/packed/gen_PackedThreeBlocks.py | 161 ------ 7 files changed, 2367 deletions(-) delete mode 100644 fine-lucene/src/main/java/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py delete mode 100644 fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/UTF32ToUTF8.py delete mode 100644 fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/createLevAutomata.py delete mode 100644 fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_BulkOperation.py delete mode 100644 fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Direct.py delete mode 100644 fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Packed64SingleBlock.py delete mode 100644 fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_PackedThreeBlocks.py diff --git a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py b/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py deleted file mode 100644 index ff9ee6bf3..000000000 --- a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py +++ /dev/null @@ -1,539 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re - -# A simple python script to generate an HTML entity map and a regex alternation -# for inclusion in HTMLStripCharFilter.jflex. - -def main(): - print get_apache_license() - codes = {} - regex = re.compile(r'\s*= 80: - print output_line - output_line = ' ' - output_line += new_entry - if key in ('quot','copy','gt','lt','reg','amp'): - new_entry = ' | "%s"' % key.upper() - if len(output_line) + len(new_entry) >= 80: - print output_line - output_line = ' ' - output_line += new_entry - print output_line, ')' - - print '%{' - print ' private static final Map upperCaseVariantsAccepted' - print ' = new HashMap();' - print ' static {' - print ' upperCaseVariantsAccepted.put("quot", "QUOT");' - print ' upperCaseVariantsAccepted.put("copy", "COPY");' - print ' upperCaseVariantsAccepted.put("gt", "GT");' - print ' upperCaseVariantsAccepted.put("lt", "LT");' - print ' upperCaseVariantsAccepted.put("reg", "REG");' - print ' upperCaseVariantsAccepted.put("amp", "AMP");' - print ' }' - print ' private static final CharArrayMap entityValues' - print ' = new CharArrayMap(Version.LUCENE_40, %i, false);' % len(keys) - print ' static {' - print ' String[] entities = {' - output_line = ' ' - for key in keys: - new_entry = ' "%s", "%s",' % (key, codes[key]) - if len(output_line) + len(new_entry) >= 80: - print output_line - output_line = ' ' - output_line += new_entry - print output_line[:-1] - print ' };' - print ' for (int i = 0 ; i < entities.length ; i += 2) {' - print ' Character value = entities[i + 1].charAt(0);' - print ' entityValues.put(entities[i], value);' - print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);' - print ' if (upperCaseVariant != null) {' - print ' entityValues.put(upperCaseVariant, value);' - print ' }' - print ' }' - print " }" - print "%}" - -def get_entity_text(): -# The text below is taken verbatim from -# : - text = r""" -F.1. XHTML Character Entities - -XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section. -F.1.1. XHTML Latin 1 Character Entities - -You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -F.1.2. XHTML Special Characters - -You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -F.1.3. XHTML Mathematical, Greek, and Symbolic Characters - -You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -""" - return text - -def get_apache_license(): - license = r"""/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -""" - return license - -main() diff --git a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/UTF32ToUTF8.py b/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/UTF32ToUTF8.py deleted file mode 100644 index 7ad381c1c..000000000 --- a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/UTF32ToUTF8.py +++ /dev/null @@ -1,366 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import types -import os -import sys -import random - -MAX_UNICODE = 0x10FFFF - -# TODO -# - could be more minimal -# - eg when bracket lands on a utf8 boundary, like 3 - 2047 -- they can share the two * edges -# - also 3 2048 or 3 65536 -- it should not have an * down the red path, but it does - -# MASKS[0] is bottom 1-bit -# MASKS[1] is bottom 2-bits -# ... - -utf8Ranges = [(0, 127), - (128, 2047), - (2048, 65535), - (65536, 1114111)] - -typeToColor = {'startend': 'purple', - 'start': 'blue', - 'end': 'red'} - -class FSA: - - def __init__(self): - # maps fromNode -> (startUTF8, endUTF8, endNode) - self.states = {} - self.nodeUpto = 0 - - def run(self, bytes): - state = self.start - for b in bytes: - found = False - oldState = state - for label, s, e, n in self.states[state][1:]: - if b >= s and b <= e: - if found: - raise RuntimeError('state %s has ambiguous output for byte %s' % (oldState, b)) - state = n - found = True - if not found: - return -1 - - return state - - def addEdge(self, n1, n2, v1, v2, label): - """ - Adds edge from n1-n2, utf8 byte range v1-v2. - """ - assert n1 in self.states - assert type(v1) is types.IntType - assert type(v2) is types.IntType - self.states[n1].append((label, v1, v2, n2)) - - def addNode(self, label=None): - try: - self.states[self.nodeUpto] = [label] - return self.nodeUpto - finally: - self.nodeUpto += 1 - - def toDOT(self, label): - __l = [] - w = __l.append - endNode = startNode = None - for id, details in self.states.items(): - name = details[0] - if name == 'end': - endNode = id - elif name == 'start': - startNode = id - - w('digraph %s {' % label) - w(' rankdir=LR;') - w(' size="8,5";') - w(' node [color=white label=""]; Ns;') - - w(' node [color=black];') - w(' node [shape=doublecircle, label=""]; N%s [label="%s"];' % (endNode, endNode)) - w(' node [shape=circle];') - - w(' N%s [label="%s"];' % (startNode, startNode)) - w(' Ns -> N%s;' % startNode) - for id, details in self.states.items(): - edges = details[1:] - w(' N%s [label="%s"];' % (id, id)) - for type, s, e, dest in edges: - c = typeToColor.get(type, 'black') - if type == 'all*': - # special case -- matches any utf8 byte at this point - label = '*' - elif s == e: - label = '%s' % binary(s) - else: - label = '%s-%s' % (binary(s), binary(e)) - w(' N%s -> N%s [label="%s" color="%s"];' % (id, dest, label, c)) - if name == 'end': - endNode = id - elif name == 'start': - startNode = id - w('}') - return '\n'.join(__l) - - def toPNG(self, label, pngOut): - open('tmp.dot', 'wb').write(self.toDOT(label)) - if os.system('dot -Tpng tmp.dot -o %s' % pngOut): - raise RuntimeException('dot failed') - - -MASKS = [] -v = 2 -for i in range(32): - MASKS.append(v-1) - v *= 2 - -def binary(x): - if x == 0: - return '00000000' - - l = [] - while x > 0: - if x & 1 == 1: - l.append('1') - else: - l.append('0') - x = x >> 1 - - # big endian! - l.reverse() - - l2 = [] - while len(l) > 0: - s = ''.join(l[-8:]) - if len(s) < 8: - s = '0'*(8-len(s)) + s - l2.append(s) - del l[-8:] - - return ' '.join(l2) - -def getUTF8Rest(code, numBytes): - l = [] - for i in range(numBytes): - l.append((128 | (code & MASKS[5]), 6)) - code = code >> 6 - l.reverse() - return tuple(l) - -def toUTF8(code): - # code = Unicode code point - assert code >= 0 - assert code <= MAX_UNICODE - - if code < 128: - # 0xxxxxxx - bytes = ((code, 7),) - elif code < 2048: - # 110yyyxx 10xxxxxx - byte1 = (6 << 5) | (code >> 6) - bytes = ((byte1, 5),) + getUTF8Rest(code, 1) - elif code < 65536: - # 1110yyyy 10yyyyxx 10xxxxxx - len = 3 - byte1 = (14 << 4) | (code >> 12) - bytes = ((byte1, 4),) + getUTF8Rest(code, 2) - else: - # 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx - len = 4 - byte1 = (30 << 3) | (code >> 18) - bytes = ((byte1, 3),) + getUTF8Rest(code, 3) - - return bytes - -def all(fsa, startNode, endNode, startCode, endCode, left): - if len(left) == 0: - fsa.addEdge(startNode, endNode, startCode, endCode, 'all') - else: - lastN = fsa.addNode() - fsa.addEdge(startNode, lastN, startCode, endCode, 'all') - while len(left) > 1: - n = fsa.addNode() - fsa.addEdge(lastN, n, 128, 191, 'all*') - left = left[1:] - lastN = n - fsa.addEdge(lastN, endNode, 128, 191, 'all*') - -def start(fsa, startNode, endNode, utf8, doAll): - if len(utf8) == 1: - fsa.addEdge(startNode, endNode, utf8[0][0], utf8[0][0] | MASKS[utf8[0][1]-1], 'start') - else: - n = fsa.addNode() - fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'start') - start(fsa, n, endNode, utf8[1:], True) - end = utf8[0][0] | MASKS[utf8[0][1]-1] - if doAll and utf8[0][0] != end: - all(fsa, startNode, endNode, utf8[0][0]+1, end, utf8[1:]) - -def end(fsa, startNode, endNode, utf8, doAll): - if len(utf8) == 1: - fsa.addEdge(startNode, endNode, utf8[0][0] & ~MASKS[utf8[0][1]-1], utf8[0][0], 'end') - else: - if utf8[0][1] == 5: - # special case -- avoid created unused edges (utf8 doesn't accept certain byte sequences): - start = 194 - else: - start = utf8[0][0] & (~MASKS[utf8[0][1]-1]) - if doAll and utf8[0][0] != start: - all(fsa, startNode, endNode, start, utf8[0][0]-1, utf8[1:]) - n = fsa.addNode() - fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'end') - end(fsa, n, endNode, utf8[1:], True) - -def build(fsa, - startNode, endNode, - startUTF8, endUTF8): - - # Break into start, middle, end: - if startUTF8[0][0] == endUTF8[0][0]: - # Degen case: lead with the same byte: - if len(startUTF8) == 1 and len(endUTF8) == 1: - fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend') - return - else: - assert len(startUTF8) != 1 - assert len(endUTF8) != 1 - n = fsa.addNode() - # single value edge - fsa.addEdge(startNode, n, startUTF8[0][0], startUTF8[0][0], 'single') - build(fsa, n, endNode, startUTF8[1:], endUTF8[1:]) - elif len(startUTF8) == len(endUTF8): - if len(startUTF8) == 1: - fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend') - else: - start(fsa, startNode, endNode, startUTF8, False) - if endUTF8[0][0] - startUTF8[0][0] > 1: - all(fsa, startNode, endNode, startUTF8[0][0]+1, endUTF8[0][0]-1, startUTF8[1:]) - end(fsa, startNode, endNode, endUTF8, False) - else: - # start - start(fsa, startNode, endNode, startUTF8, True) - - # possibly middle - byteCount = 1+len(startUTF8) - while byteCount < len(endUTF8): - s = toUTF8(utf8Ranges[byteCount-1][0]) - e = toUTF8(utf8Ranges[byteCount-1][1]) - all(fsa, startNode, endNode, - s[0][0], - e[0][0], - s[1:]) - byteCount += 1 - - # end - end(fsa, startNode, endNode, endUTF8, True) - -def main(): - - if len(sys.argv) not in (3, 4): - print - print 'Usage: python %s startUTF32 endUTF32 [testCode]' % sys.argv[0] - print - sys.exit(1) - - utf32Start = int(sys.argv[1]) - utf32End = int(sys.argv[2]) - - if utf32Start > utf32End: - print 'ERROR: start must be <= end' - sys.exit(1) - - fsa = FSA() - fsa.start = fsa.addNode('start') - fsa.end = fsa.addNode('end') - - print 's=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32Start)]) - print 'e=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32End)]) - - if len(sys.argv) == 4: - print 't=%s [%s]' % \ - (' '.join([binary(x[0]) for x in toUTF8(int(sys.argv[3]))]), - ' '.join(['%2x' % x[0] for x in toUTF8(int(sys.argv[3]))])) - - build(fsa, fsa.start, fsa.end, - toUTF8(utf32Start), - toUTF8(utf32End)) - - fsa.toPNG('test', '/tmp/outpy.png') - print 'Saved to /tmp/outpy.png...' - - test(fsa, utf32Start, utf32End, 100000); - -def test(fsa, utf32Start, utf32End, count): - - # verify correct ints are accepted - for i in range(count): - r = random.randint(utf32Start, utf32End) - dest = fsa.run([tup[0] for tup in toUTF8(r)]) - if dest != fsa.end: - print 'FAILED: valid %s (%s) is not accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])) - return False - - invalidRange = MAX_UNICODE - (utf32End - utf32Start + 1) - if invalidRange >= 0: - # verify invalid ints are not accepted - for i in range(count): - r = random.randint(0, invalidRange-1) - if r >= utf32Start: - r = utf32End + 1 + r - utf32Start - dest = fsa.run([tup[0] for tup in toUTF8(r)]) - if dest != -1: - print 'FAILED: invalid %s (%s) is accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])) - return False - - return True - -def stress(): - - print 'Testing...' - - iter = 0 - while True: - if iter % 10 == 0: - print '%s...' % iter - iter += 1 - - v1 = random.randint(0, MAX_UNICODE) - v2 = random.randint(0, MAX_UNICODE) - if v2 < v1: - v1, v2 = v2, v1 - - utf32Start = v1 - utf32End = v2 - - fsa = FSA() - fsa.start = fsa.addNode('start') - fsa.end = fsa.addNode('end') - build(fsa, fsa.start, fsa.end, - toUTF8(utf32Start), - toUTF8(utf32End)) - - if not test(fsa, utf32Start, utf32End, 10000): - print 'FAILED on utf32Start=%s utf32End=%s' % (utf32Start, utf32End) - -if __name__ == '__main__': - if len(sys.argv) > 1: - main() - else: - stress() diff --git a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/createLevAutomata.py b/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/createLevAutomata.py deleted file mode 100644 index 2fc41575e..000000000 --- a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/automaton/createLevAutomata.py +++ /dev/null @@ -1,500 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note, this file is known to work with rev 120 of the moman -# repository (http://bitbucket.org/jpbarrette/moman/overview) -# -# See also: http://sites.google.com/site/rrettesite/moman - -import math -import os -import sys -#sys.path.insert(0, 'moman/finenight/python') -sys.path.insert(0, '../../../../../../../../build/core/moman/finenight/python') -try: - from possibleStates import genTransitions -except ImportError: - from finenight.possibleStates import genTransitions - -MODE = 'array' -PACKED = True -WORD = 64 -LOG2_WORD = int(math.log(WORD)/math.log(2)) -#MODE = 'switch' - -class LineOutput: - - def __init__(self, indent=''): - self.l = [] - self._indent = self.startIndent = indent - self.inComment = False - - def __call__(self, s, indent=0): - if s.find('}') != -1: - assert self._indent != self.startIndent - self._indent = self._indent[:-2] - - if indent != 0: - indent0 = ' ' * (len(self._indent)/2+indent) - else: - indent0 = self._indent - - if s.find('/*') != -1: - if s.find('*/') == -1: - self.inComment = True - elif s.find('*/') != -1: - self.inComment = True - - if self.inComment: - self.l.append(indent0 + s) - else: - self.l.append(indent0 + s.lstrip()) - - self.inComment = self.inComment and s.find('*/') == -1 - - if s.find('{') != -1: - self._indent += ' ' - - def __str__(self): - if True: - assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \ - (len(self._indent), len(self.startIndent)) - return '\n'.join(self.l) - - def indent(self): - self._indent += ' ' - - def outdent(self): - assert self._indent != self.startIndent - self._indent = self._indent[:-2] - -def charVarNumber(charVar): - """ - Maps binary number (eg [1, 0, 1]) to its decimal value (5). - """ - - p = 1 - sum = 0 - downTo = len(charVar)-1 - while downTo >= 0: - sum += p * int(charVar[downTo]) - p *= 2 - downTo -= 1 - return sum - -def main(): - - if len(sys.argv) != 3: - print - print 'Usage: python -u %s N ' % sys.argv[0] - print - print 'NOTE: the resulting .java file is created in the current working dir!' - print - sys.exit(1) - - n = int(sys.argv[1]) - - transpose = (sys.argv[2] == "True") - - tables = genTransitions(n, transpose) - - stateMap = {} - - # init null state - stateMap['[]'] = -1 - - # init start state - stateMap['[(0, 0)]'] = 0 - - w = LineOutput() - - w('package com.fr.third.org.apache.lucene.util.automaton;') - w('') - w('/*') - w(' * Licensed to the Apache Software Foundation (ASF) under one or more') - w(' * contributor license agreements. See the NOTICE file distributed with') - w(' * this work for additional information regarding copyright ownership.') - w(' * The ASF licenses this file to You under the Apache License, Version 2.0') - w(' * (the "License"); you may not use this file except in compliance with') - w(' * the License. You may obtain a copy of the License at') - w(' *') - w(' * http://www.apache.org/licenses/LICENSE-2.0') - w(' *') - w(' * Unless required by applicable law or agreed to in writing, software') - w(' * distributed under the License is distributed on an "AS IS" BASIS,') - w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.') - w(' * See the License for the specific language governing permissions and') - w(' * limitations under the License.') - w(' */') - w('') - w('// The following code was generated with the moman/finenight pkg') - w('// This package is available under the MIT License, see NOTICE.txt') - w('// for more details.') - w('') - w('import com.fr.third.org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;') - w('') - if transpose: - w('/** Parametric description for generating a Levenshtein automaton of degree %s, ' % n) - w(' with transpositions as primitive edits */') - className = 'Lev%dTParametricDescription' % n - else: - w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n) - className = 'Lev%dParametricDescription' % n - - w('class %s extends ParametricDescription {' % className) - - w('') - w('@Override') - w('int transition(int absState, int position, int vector) {') - - w(' // null absState should never be passed in') - w(' assert absState != -1;') - - w('') - w(' // decode absState -> state, offset') - w(' int state = absState/(w+1);') - w(' int offset = absState%(w+1);') - w(' assert offset >= 0;') - w('') - - machines = [] - - for i, map in enumerate(tables): - if i == 0: - w('if (position == w) {') - elif i == len(tables)-1: - w('} else {') - else: - w('} else if (position == w-%d) {' % i) - - if i != 0 and MODE == 'switch': - w('switch(vector) {') - - l = map.items() - l.sort() - - numCasesPerVector = None - numVectors = len(l) - - if MODE == 'array': - toStateArray = [] - toOffsetIncrArray = [] - - for charVar, states in l: - - # somehow it's a string: - charVar = eval(charVar) - - if i != 0 and MODE == 'switch': - w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar]))) - w.indent() - - l = states.items() - - byFromState = {} - - # first pass to assign states - byAction = {} - for s, (toS, offset) in l: - state = str(s) - - toState = str(toS) - if state not in stateMap: - stateMap[state] = len(stateMap)-1 - if toState not in stateMap: - stateMap[toState] = len(stateMap)-1 - - byFromState[stateMap[state]] = (1+stateMap[toState], offset) - - fromStateDesc = s[1:len(s)-1] - toStateDesc = ', '.join([str(x) for x in toS]) - - tup = (stateMap[toState], toStateDesc, offset) - if tup not in byAction: - byAction[tup] = [] - byAction[tup].append((fromStateDesc, stateMap[state])) - - if numCasesPerVector is None: - numCasesPerVector = len(l) - else: - # we require this to be uniform... empirically it seems to be! - assert numCasesPerVector == len(l) - - if MODE == 'array': - - for s in range(numCasesPerVector): - toState, offsetIncr = byFromState[s] - toStateArray.append(toState) - toOffsetIncrArray.append(offsetIncr) - - else: - - # render switches - w('switch(state) { // %s cases' % len(l)) - - for (toState, toStateDesc, offset), lx in byAction.items(): - for fromStateDesc, fromState in lx: - w('case %s: // %s' % (fromState, fromStateDesc)) - w.indent() - w(' state = %s; // %s' % (toState, toStateDesc)) - if offset > 0: - w(' offset += %s;' % offset) - w('break;') - w.outdent() - - w('}') - if i != 0: - w('break;') - w.outdent() - - if MODE == 'array': - # strangely state can come in wildly out of bounds.... - w(' if (state < %d) {' % numCasesPerVector) - w(' final int loc = vector * %d + state;' % numCasesPerVector) - if PACKED: - w(' offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i)) - w(' state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i)) - else: - w(' offset += offsetIncrs%d[loc];' % i) - w(' state = toStates%d[loc]-1;' % i) - w(' }') - elif i != 0: - w('}') - - machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors)) - - # ends switch statement for machine - w('}') - - w('') - - w(' if (state == -1) {') - w(' // null state') - w(' return -1;') - w(' } else {') - w(' // translate back to abs') - w(' return state*(w+1)+offset;') - w(' }') - - # ends transition method - w('}') - - subs = [] - if MODE == 'array': - w.indent() - for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines): - w('') - w.outdent() - w('// %d vectors; %d states per vector; array length = %d' % \ - (numVectors, numCasesPerVector, numVectors*numCasesPerVector)) - w.indent() - if PACKED: - # pack in python - l, nbits = pack(toStateArray) - subs.append(('NBITSSTATES%d' % i, str(nbits))) - w(' private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \ - (i, nbits, renderList([hex(long(x)) for x in l]))) - - l, nbits = pack(toOffsetIncrsArray) - subs.append(('NBITSOFFSET%d' % i, str(nbits))) - w(' private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \ - (i, nbits, renderList([hex(long(x)) for x in l]))) - else: - w(' private final static int[] toStates%d = new int[] %s;' % \ - (i, renderList([str(x) for x in toStateArray]))) - w(' private final static int[] offsetIncrs%d = new int[] %s;' % \ - (i, renderList([str(x) for x in toStateArray]))) - w.outdent() - - stateMap2 = dict([[v,k] for k,v in stateMap.items()]) - w('') - w('// state map') - sum = 0 - minErrors = [] - for i in xrange(len(stateMap2)-1): - w('// %s -> %s' % (i, stateMap2[i])) - # we replace t-notation as its not relevant here - st = stateMap2[i].replace('t', '') - - v = eval(st) - minError = min([-i+e for i, e in v]) - c = len(v) - sum += c - minErrors.append(minError) - w('') - - w.indent() - #w('private final static int[] minErrors = new int[] {%s};' % ','.join([str(x) for x in minErrors])) - - w.outdent() - - w('') - w(' public %s(int w) {' % className) - w(' super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1) - w(' }') - - if 0: - w('') - w('@Override') - w('public int size() { // this can now move up?') - w(' return %d*(w+1);' % (len(stateMap2)-1)) - w('}') - - w('') - w('@Override') - w('public int getPosition(int absState) { // this can now move up?') - w(' return absState % (w+1);') - w('}') - - w('') - w('@Override') - w('public boolean isAccept(int absState) { // this can now move up?') - w(' // decode absState -> state, offset') - w(' int state = absState/(w+1);') - w(' if (true || state < minErrors.length) {') - w(' int offset = absState%(w+1);') - w(' assert offset >= 0;') - w(' return w - offset + minErrors[state] <= %d;' % n) - w(' } else {') - w(' return false;') - w(' }') - w('}') - - if MODE == 'array' and PACKED: - - # we moved into super class - if False: - w('') - - v = 2 - l = [] - for i in range(63): - l.append(hex(v-1)) - v *= 2 - - w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1) - w('') - - # unpack in java - w('private int unpack(long[] data, int index, int bitsPerValue) {') - w(' final long bitLoc = bitsPerValue * index;') - w(' final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD) - w(' final int bitStart = (int) (bitLoc & %d);' % (WORD-1)) - w(' //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);') - w(' if (bitStart + bitsPerValue <= %d) {' % WORD) - w(' // not split') - w(' return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);') - w(' } else {') - w(' // split') - w(' final int part = %d-bitStart;' % WORD) - w(' return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +') - w(' ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1) - w(' }') - w('}') - - # class - w('}') - w('') - - fileOut = '%s.java' % className - - s = str(w) - for sub, repl in subs: - s = s.replace(sub, repl) - - open(fileOut, 'wb').write(s) - - print 'Wrote %s [%d lines; %.1f KB]' % \ - (fileOut, len(w.l), os.path.getsize(fileOut)/1024.) - -def renderList(l): - lx = [' '] - for i in xrange(len(l)): - if i > 0: - lx.append(',') - if i % 4 == 0: - lx.append('\n ') - lx.append(l[i]) - return '{\n%s\n }' % ''.join(lx) - -MASKS = [] -v = 2 -for i in xrange(63): - MASKS.append(v-1) - v *= 2 - -# packs into longs; returns long[], numBits -def pack(l): - maxV = max(l) - bitsPerValue = max(1, int(math.ceil(math.log(maxV+1)/math.log(2.0)))) - - bitsLeft = WORD - pendingValue = 0 - - packed = [] - for i in xrange(len(l)): - v = l[i] - if pendingValue > 0: - bitsUsed = math.ceil(math.log(pendingValue)/math.log(2.0)) - assert bitsUsed <= (WORD-bitsLeft), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (bitsLeft, WORD, bitsLeft, WORD-bitsLeft, bitsUsed) - - if bitsLeft >= bitsPerValue: - pendingValue += v << (WORD-bitsLeft) - bitsLeft -= bitsPerValue - if bitsLeft == 0: - packed.append(pendingValue) - bitsLeft = WORD - pendingValue = 0 - else: - # split - - # bottom bitsLeft go in current word: - pendingValue += (v & MASKS[bitsLeft-1]) << (WORD-bitsLeft) - packed.append(pendingValue) - - pendingValue = v >> bitsLeft - bitsLeft = WORD - (bitsPerValue-bitsLeft) - - if bitsLeft < WORD: - packed.append(pendingValue) - - # verify(l, packed, bitsPerValue) - - return packed, bitsPerValue - -def verify(data, packedData, bitsPerValue): - for i in range(len(data)): - assert data[i] == unpack(packedData, i, bitsPerValue) - -def unpack(data, index, bitsPerValue): - bitLoc = bitsPerValue * index - dataLoc = int(bitLoc >> LOG2_WORD) - bitStart = int(bitLoc & (WORD-1)) - if bitStart + bitsPerValue <= WORD: - # not split - return int(((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1])) - else: - # split - part = WORD-bitStart; - return int((((data[dataLoc] >> bitStart) & MASKS[part-1]) + - ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part))) - -if __name__ == '__main__': - if not __debug__: - print - print 'ERROR: please run without -O' - print - sys.exit(1) - main() diff --git a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_BulkOperation.py b/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_BulkOperation.py deleted file mode 100644 index 56c8c9d00..000000000 --- a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_BulkOperation.py +++ /dev/null @@ -1,335 +0,0 @@ -#! /usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from fractions import gcd - -"""Code generation for bulk operations""" - -MAX_SPECIALIZED_BITS_PER_VALUE = 24; -PACKED_64_SINGLE_BLOCK_BPV = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32] -OUTPUT_FILE = "BulkOperation.java" -HEADER = """// This file has been automatically generated, DO NOT EDIT - -package com.fr.third.org.apache.lucene.util.packed; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -""" - -FOOTER=""" - protected int writeLong(long block, byte[] blocks, int blocksOffset) { - for (int j = 1; j <= 8; ++j) { - blocks[blocksOffset++] = (byte) (block >>> (64 - (j << 3))); - } - return blocksOffset; - } - - /** - * For every number of bits per value, there is a minimum number of - * blocks (b) / values (v) you need to write in order to reach the next block - * boundary: - * - 16 bits per value -> b=1, v=4 - * - 24 bits per value -> b=3, v=8 - * - 50 bits per value -> b=25, v=32 - * - 63 bits per value -> b=63, v=64 - * - ... - * - * A bulk read consists in copying iterations*v values that are - * contained in iterations*b blocks into a long[] - * (higher values of iterations are likely to yield a better - * throughput) => this requires n * (b + v) longs in memory. - * - * This method computes iterations as - * ramBudget / (8 * (b + v)) (since a long is 8 bytes). - */ - public final int computeIterations(int valueCount, int ramBudget) { - final int iterations = (ramBudget >>> 3) / (blockCount() + valueCount()); - if (iterations == 0) { - // at least 1 - return 1; - } else if ((iterations - 1) * blockCount() >= valueCount) { - // don't allocate for more than the size of the reader - return (int) Math.ceil((double) valueCount / valueCount()); - } else { - return iterations; - } - } -} -""" - -def is_power_of_two(n): - return n & (n - 1) == 0 - -def casts(typ): - cast_start = "(%s) (" %typ - cast_end = ")" - if typ == "long": - cast_start = "" - cast_end = "" - return cast_start, cast_end - -def hexNoLSuffix(n): - # On 32 bit Python values > (1 << 31)-1 will have L appended by hex function: - s = hex(n) - if s.endswith('L'): - s = s[:-1] - return s - -def masks(bits): - if bits == 64: - return "", "" - return "(", " & %sL)" %(hexNoLSuffix((1 << bits) - 1)) - -def get_type(bits): - if bits == 8: - return "byte" - elif bits == 16: - return "short" - elif bits == 32: - return "int" - elif bits == 64: - return "long" - else: - assert False - -def block_value_count(bpv, bits=64): - blocks = bpv - values = blocks * bits / bpv - while blocks % 2 == 0 and values % 2 == 0: - blocks /= 2 - values /= 2 - assert values * bpv == bits * blocks, "%d values, %d blocks, %d bits per value" %(values, blocks, bpv) - return (blocks, values) - -def packed64(bpv, f): - blocks, values = block_value_count(bpv) - mask = (1 << bpv) - 1 - - f.write("\n") - f.write(" public BulkOperationPacked%d() {\n" %bpv) - f.write(" super(%d);\n" %bpv) - f.write(" assert blockCount() == %d;\n" %blocks) - f.write(" assert valueCount() == %d;\n" %values) - f.write(" }\n\n") - - if bpv == 64: - f.write(""" @Override - public void decode(long[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) { - System.arraycopy(blocks, blocksOffset, values, valuesOffset, valueCount() * iterations); - } - - @Override - public void decode(long[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) { - throw new UnsupportedOperationException(); - } - - @Override - public void decode(byte[] blocks, int blocksOffset, int[] values, int valuesOffset, int iterations) { - throw new UnsupportedOperationException(); - } - - @Override - public void decode(byte[] blocks, int blocksOffset, long[] values, int valuesOffset, int iterations) { - LongBuffer.wrap(values, valuesOffset, iterations * valueCount()).put(ByteBuffer.wrap(blocks, blocksOffset, 8 * iterations * blockCount()).asLongBuffer()); - } -""") - else: - p64_decode(bpv, f, 32) - p64_decode(bpv, f, 64) - -def p64_decode(bpv, f, bits): - blocks, values = block_value_count(bpv) - typ = get_type(bits) - cast_start, cast_end = casts(typ) - - f.write(" @Override\n") - f.write(" public void decode(long[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ) - if bits < bpv: - f.write(" throw new UnsupportedOperationException();\n") - else: - f.write(" for (int i = 0; i < iterations; ++i) {\n") - mask = (1 << bpv) - 1 - - if is_power_of_two(bpv): - f.write(" final long block = blocks[blocksOffset++];\n") - f.write(" for (int shift = %d; shift >= 0; shift -= %d) {\n" %(64 - bpv, bpv)) - f.write(" values[valuesOffset++] = %s(block >>> shift) & %d%s;\n" %(cast_start, mask, cast_end)) - f.write(" }\n") - else: - for i in xrange(0, values): - block_offset = i * bpv / 64 - bit_offset = (i * bpv) % 64 - if bit_offset == 0: - # start of block - f.write(" final long block%d = blocks[blocksOffset++];\n" %block_offset); - f.write(" values[valuesOffset++] = %sblock%d >>> %d%s;\n" %(cast_start, block_offset, 64 - bpv, cast_end)) - elif bit_offset + bpv == 64: - # end of block - f.write(" values[valuesOffset++] = %sblock%d & %dL%s;\n" %(cast_start, block_offset, mask, cast_end)) - elif bit_offset + bpv < 64: - # middle of block - f.write(" values[valuesOffset++] = %s(block%d >>> %d) & %dL%s;\n" %(cast_start, block_offset, 64 - bit_offset - bpv, mask, cast_end)) - else: - # value spans across 2 blocks - mask1 = (1 << (64 - bit_offset)) -1 - shift1 = bit_offset + bpv - 64 - shift2 = 64 - shift1 - f.write(" final long block%d = blocks[blocksOffset++];\n" %(block_offset + 1)); - f.write(" values[valuesOffset++] = %s((block%d & %dL) << %d) | (block%d >>> %d)%s;\n" %(cast_start, block_offset, mask1, shift1, block_offset + 1, shift2, cast_end)) - f.write(" }\n") - f.write(" }\n\n") - - byte_blocks, byte_values = block_value_count(bpv, 8) - - f.write(" @Override\n") - f.write(" public void decode(byte[] blocks, int blocksOffset, %s[] values, int valuesOffset, int iterations) {\n" %typ) - if bits < bpv: - f.write(" throw new UnsupportedOperationException();\n") - else: - - if is_power_of_two(bpv) and bpv < 8: - f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n") - f.write(" final byte block = blocks[blocksOffset++];\n") - for shift in xrange(8 - bpv, 0, -bpv): - f.write(" values[valuesOffset++] = (block >>> %d) & %d;\n" %(shift, mask)) - f.write(" values[valuesOffset++] = block & %d;\n" %mask) - f.write(" }\n") - elif bpv == 8: - f.write(" for (int j = 0; j < 8 * iterations; ++j) {\n") - f.write(" values[valuesOffset++] = blocks[blocksOffset++] & 0xFF;\n") - f.write(" }\n") - elif is_power_of_two(bpv) and bpv > 8: - f.write(" for (int j = 0; j < %d * iterations; ++j) {\n" %(64 / bpv)) - m = bits <= 32 and "0xFF" or "0xFFL" - f.write(" values[valuesOffset++] =") - for i in xrange(bpv / 8 - 1): - f.write(" ((blocks[blocksOffset++] & %s) << %d) |" %(m, bpv - 8)) - f.write(" (blocks[blocksOffset++] & %s);\n" %m) - f.write(" }\n") - else: - f.write(" for (int i = 0; i < 8 * iterations; ++i) {\n") - for i in xrange(0, byte_values): - byte_start = i * bpv / 8 - bit_start = (i * bpv) % 8 - byte_end = ((i + 1) * bpv - 1) / 8 - bit_end = ((i + 1) * bpv - 1) % 8 - shift = lambda b: 8 * (byte_end - b - 1) + 1 + bit_end - if bit_start == 0: - f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, byte_start)) - for b in xrange(byte_start + 1, byte_end + 1): - f.write(" final %s byte%d = blocks[blocksOffset++] & 0xFF;\n" %(typ, b)) - f.write(" values[valuesOffset++] =") - if byte_start == byte_end: - if bit_start == 0: - if bit_end == 7: - f.write(" byte%d" %byte_start) - else: - f.write(" byte%d >>> %d" %(byte_start, 7 - bit_end)) - else: - if bit_end == 7: - f.write(" byte%d & %d" %(byte_start, 2 ** (8 - bit_start) - 1)) - else: - f.write(" (byte%d >>> %d) & %d" %(byte_start, 7 - bit_end, 2 ** (bit_end - bit_start + 1) - 1)) - else: - if bit_start == 0: - f.write(" (byte%d << %d)" %(byte_start, shift(byte_start))) - else: - f.write(" ((byte%d & %d) << %d)" %(byte_start, 2 ** (8 - bit_start) - 1, shift(byte_start))) - for b in xrange(byte_start + 1, byte_end): - f.write(" | (byte%d << %d)" %(b, shift(b))) - if bit_end == 7: - f.write(" | byte%d" %byte_end) - else: - f.write(" | (byte%d >>> %d)" %(byte_end, 7 - bit_end)) - f.write(";\n") - f.write(" }\n") - f.write(" }\n\n") - -if __name__ == '__main__': - f = open(OUTPUT_FILE, 'w') - f.write(HEADER) - f.write('\n') - f.write('''/** - * Efficient sequential read/write of packed integers. - */\n''') - - f.write('abstract class BulkOperation implements PackedInts.Decoder, PackedInts.Encoder {\n') - f.write(' private static final BulkOperation[] packedBulkOps = new BulkOperation[] {\n') - - for bpv in xrange(1, 65): - if bpv > MAX_SPECIALIZED_BITS_PER_VALUE: - f.write(' new BulkOperationPacked(%d),\n' % bpv) - continue - f2 = open('BulkOperationPacked%d.java' % bpv, 'w') - f2.write(HEADER) - if bpv == 64: - f2.write('import java.nio.LongBuffer;\n') - f2.write('import java.nio.ByteBuffer;\n') - f2.write('\n') - f2.write('''/** - * Efficient sequential read/write of packed integers. - */\n''') - f2.write('final class BulkOperationPacked%d extends BulkOperationPacked {\n' % bpv) - packed64(bpv, f2) - f2.write('}\n') - f2.close() - f.write(' new BulkOperationPacked%d(),\n' % bpv) - - f.write(' };\n') - f.write('\n') - - f.write(' // NOTE: this is sparse (some entries are null):\n') - f.write(' private static final BulkOperation[] packedSingleBlockBulkOps = new BulkOperation[] {\n') - for bpv in xrange(1, max(PACKED_64_SINGLE_BLOCK_BPV)+1): - if bpv in PACKED_64_SINGLE_BLOCK_BPV: - f.write(' new BulkOperationPackedSingleBlock(%d),\n' % bpv) - else: - f.write(' null,\n') - f.write(' };\n') - f.write('\n') - - f.write("\n") - f.write(" public static BulkOperation of(PackedInts.Format format, int bitsPerValue) {\n") - f.write(" switch (format) {\n") - - f.write(" case PACKED:\n") - f.write(" assert packedBulkOps[bitsPerValue - 1] != null;\n") - f.write(" return packedBulkOps[bitsPerValue - 1];\n") - f.write(" case PACKED_SINGLE_BLOCK:\n") - f.write(" assert packedSingleBlockBulkOps[bitsPerValue - 1] != null;\n") - f.write(" return packedSingleBlockBulkOps[bitsPerValue - 1];\n") - f.write(" default:\n") - f.write(" throw new AssertionError();\n") - f.write(" }\n") - f.write(" }\n") - f.write(FOOTER) - f.close() diff --git a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Direct.py b/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Direct.py deleted file mode 100644 index bb5dd657c..000000000 --- a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Direct.py +++ /dev/null @@ -1,175 +0,0 @@ -#! /usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -HEADER="""// This file has been automatically generated, DO NOT EDIT - -package com.fr.third.org.apache.lucene.util.packed; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import com.fr.third.org.apache.lucene.store.DataInput; -import com.fr.third.org.apache.lucene.util.RamUsageEstimator; - -import java.io.IOException; -import java.util.Arrays; - -""" - -TYPES = {8: "byte", 16: "short", 32: "int", 64: "long"} -MASKS = {8: " & 0xFFL", 16: " & 0xFFFFL", 32: " & 0xFFFFFFFFL", 64: ""} -CASTS = {8: "(byte) ", 16: "(short) ", 32: "(int) ", 64: ""} - -if __name__ == '__main__': - for bpv in TYPES.keys(): - type - f = open("Direct%d.java" %bpv, 'w') - f.write(HEADER) - f.write("""/** - * Direct wrapping of %d-bits values to a backing array. - * @lucene.internal - */\n""" %bpv) - f.write("final class Direct%d extends PackedInts.MutableImpl {\n" %bpv) - f.write(" final %s[] values;\n\n" %TYPES[bpv]) - - f.write(" Direct%d(int valueCount) {\n" %bpv) - f.write(" super(valueCount, %d);\n" %bpv) - f.write(" values = new %s[valueCount];\n" %TYPES[bpv]) - f.write(" }\n\n") - - f.write(" Direct%d(DataInput in, int valueCount) throws IOException {\n" %bpv) - f.write(" this(valueCount);\n") - f.write(" for (int i = 0; i < valueCount; ++i) {\n") - f.write(" values[i] = in.read%s();\n" %TYPES[bpv].title()) - f.write(" }\n") - if bpv != 64: - f.write(" final int mod = valueCount %% %d;\n" %(64 / bpv)) - f.write(" if (mod != 0) {\n") - f.write(" for (int i = mod; i < %d; ++i) {\n" %(64 / bpv)) - f.write(" in.read%s();\n" %TYPES[bpv].title()) - f.write(" }\n") - f.write(" }\n") - f.write(" }\n") - - f.write(""" - @Override - public long get(final int index) { - return values[index]%s; - } - - public void set(final int index, final long value) { - values[index] = %s(value); - } - - public long ramBytesUsed() { - return RamUsageEstimator.sizeOf(values); - } - - public void clear() { - Arrays.fill(values, %s0L); - } - - @Override - public Object getArray() { - return values; - } - - @Override - public boolean hasArray() { - return true; - } -""" %(MASKS[bpv], CASTS[bpv], CASTS[bpv])) - - if bpv == 64: - f.write(""" - @Override - public int get(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - assert off + len <= arr.length; - - final int gets = Math.min(valueCount - index, len); - System.arraycopy(values, index, arr, off, gets); - return gets; - } - - public int set(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - assert off + len <= arr.length; - - final int sets = Math.min(valueCount - index, len); - System.arraycopy(arr, off, values, index, sets); - return sets; - } - - @Override - public void fill(int fromIndex, int toIndex, long val) { - Arrays.fill(values, fromIndex, toIndex, val); - } -""") - else: - f.write(""" - @Override - public int get(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - assert off + len <= arr.length; - - final int gets = Math.min(valueCount - index, len); - for (int i = index, o = off, end = index + gets; i < end; ++i, ++o) { - arr[o] = values[i]%s; - } - return gets; - } - - public int set(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - assert off + len <= arr.length; - - final int sets = Math.min(valueCount - index, len); - for (int i = index, o = off, end = index + sets; i < end; ++i, ++o) { - values[i] = %sarr[o]; - } - return sets; - } - - @Override - public void fill(int fromIndex, int toIndex, long val) { - assert val == (val%s); - Arrays.fill(values, fromIndex, toIndex, %sval); - } -""" %(MASKS[bpv], CASTS[bpv], MASKS[bpv], CASTS[bpv])) - - f.write("}\n") - - f.close() diff --git a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Packed64SingleBlock.py b/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Packed64SingleBlock.py deleted file mode 100644 index 77d8e3e33..000000000 --- a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_Packed64SingleBlock.py +++ /dev/null @@ -1,291 +0,0 @@ -#! /usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -SUPPORTED_BITS_PER_VALUE = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 21, 32] - -HEADER="""// This file has been automatically generated, DO NOT EDIT - -package com.fr.third.org.apache.lucene.util.packed; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with this - * work for additional information regarding copyright ownership. The ASF - * licenses this file to You under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -import java.io.IOException; -import java.util.Arrays; - -import com.fr.third.org.apache.lucene.store.DataInput; -import com.fr.third.org.apache.lucene.util.RamUsageEstimator; - -/** - * This class is similar to {@link Packed64} except that it trades space for - * speed by ensuring that a single block needs to be read/written in order to - * read/write a value. - */ -abstract class Packed64SingleBlock extends PackedInts.MutableImpl { - - public static final int MAX_SUPPORTED_BITS_PER_VALUE = %d; - private static final int[] SUPPORTED_BITS_PER_VALUE = new int[] {%s}; - - public static boolean isSupported(int bitsPerValue) { - return Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) >= 0; - } - - private static int requiredCapacity(int valueCount, int valuesPerBlock) { - return valueCount / valuesPerBlock - + (valueCount %% valuesPerBlock == 0 ? 0 : 1); - } - - final long[] blocks; - - Packed64SingleBlock(int valueCount, int bitsPerValue) { - super(valueCount, bitsPerValue); - assert isSupported(bitsPerValue); - final int valuesPerBlock = 64 / bitsPerValue; - blocks = new long[requiredCapacity(valueCount, valuesPerBlock)]; - } - - @Override - public void clear() { - Arrays.fill(blocks, 0L); - } - - public long ramBytesUsed() { - return RamUsageEstimator.sizeOf(blocks); - } - - @Override - public int get(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - len = Math.min(len, valueCount - index); - assert off + len <= arr.length; - - final int originalIndex = index; - - // go to the next block boundary - final int valuesPerBlock = 64 / bitsPerValue; - final int offsetInBlock = index %% valuesPerBlock; - if (offsetInBlock != 0) { - for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) { - arr[off++] = get(index++); - --len; - } - if (len == 0) { - return index - originalIndex; - } - } - - // bulk get - assert index %% valuesPerBlock == 0; - final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); - assert decoder.blockCount() == 1; - assert decoder.valueCount() == valuesPerBlock; - final int blockIndex = index / valuesPerBlock; - final int nblocks = (index + len) / valuesPerBlock - blockIndex; - decoder.decode(blocks, blockIndex, arr, off, nblocks); - final int diff = nblocks * valuesPerBlock; - index += diff; len -= diff; - - if (index > originalIndex) { - // stay at the block boundary - return index - originalIndex; - } else { - // no progress so far => already at a block boundary but no full block to - // get - assert index == originalIndex; - return super.get(index, arr, off, len); - } - } - - @Override - public int set(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - len = Math.min(len, valueCount - index); - assert off + len <= arr.length; - - final int originalIndex = index; - - // go to the next block boundary - final int valuesPerBlock = 64 / bitsPerValue; - final int offsetInBlock = index %% valuesPerBlock; - if (offsetInBlock != 0) { - for (int i = offsetInBlock; i < valuesPerBlock && len > 0; ++i) { - set(index++, arr[off++]); - --len; - } - if (len == 0) { - return index - originalIndex; - } - } - - // bulk set - assert index %% valuesPerBlock == 0; - final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue); - assert op.blockCount() == 1; - assert op.valueCount() == valuesPerBlock; - final int blockIndex = index / valuesPerBlock; - final int nblocks = (index + len) / valuesPerBlock - blockIndex; - op.encode(arr, off, blocks, blockIndex, nblocks); - final int diff = nblocks * valuesPerBlock; - index += diff; len -= diff; - - if (index > originalIndex) { - // stay at the block boundary - return index - originalIndex; - } else { - // no progress so far => already at a block boundary but no full block to - // set - assert index == originalIndex; - return super.set(index, arr, off, len); - } - } - - @Override - public void fill(int fromIndex, int toIndex, long val) { - assert fromIndex >= 0; - assert fromIndex <= toIndex; - assert PackedInts.bitsRequired(val) <= bitsPerValue; - - final int valuesPerBlock = 64 / bitsPerValue; - if (toIndex - fromIndex <= valuesPerBlock << 1) { - // there needs to be at least one full block to set for the block - // approach to be worth trying - super.fill(fromIndex, toIndex, val); - return; - } - - // set values naively until the next block start - int fromOffsetInBlock = fromIndex %% valuesPerBlock; - if (fromOffsetInBlock != 0) { - for (int i = fromOffsetInBlock; i < valuesPerBlock; ++i) { - set(fromIndex++, val); - } - assert fromIndex %% valuesPerBlock == 0; - } - - // bulk set of the inner blocks - final int fromBlock = fromIndex / valuesPerBlock; - final int toBlock = toIndex / valuesPerBlock; - assert fromBlock * valuesPerBlock == fromIndex; - - long blockValue = 0L; - for (int i = 0; i < valuesPerBlock; ++i) { - blockValue = blockValue | (val << (i * bitsPerValue)); - } - Arrays.fill(blocks, fromBlock, toBlock, blockValue); - - // fill the gap - for (int i = valuesPerBlock * toBlock; i < toIndex; ++i) { - set(i, val); - } - } - - @Override - protected PackedInts.Format getFormat() { - return PackedInts.Format.PACKED_SINGLE_BLOCK; - } - - @Override - public String toString() { - return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue - + ", size=" + size() + ", elements.length=" + blocks.length + ")"; - } - - public static Packed64SingleBlock create(DataInput in, - int valueCount, int bitsPerValue) throws IOException { - Packed64SingleBlock reader = create(valueCount, bitsPerValue); - for (int i = 0; i < reader.blocks.length; ++i) { - reader.blocks[i] = in.readLong(); - } - return reader; - } - -""" %(SUPPORTED_BITS_PER_VALUE[-1], ", ".join(map(str, SUPPORTED_BITS_PER_VALUE))) - -FOOTER = "}" - -if __name__ == '__main__': - - f = open("Packed64SingleBlock.java", 'w') - f.write(HEADER) - f.write(" public static Packed64SingleBlock create(int valueCount, int bitsPerValue) {\n") - f.write(" switch (bitsPerValue) {\n") - for bpv in SUPPORTED_BITS_PER_VALUE: - f.write(" case %d:\n" %bpv) - f.write(" return new Packed64SingleBlock%d(valueCount);\n" %bpv) - f.write(" default:\n") - f.write(" throw new IllegalArgumentException(\"Unsupported number of bits per value: \" + %d);\n" %bpv) - f.write(" }\n") - f.write(" }\n\n") - - for bpv in SUPPORTED_BITS_PER_VALUE: - log_2 = 0 - while (1 << log_2) < bpv: - log_2 = log_2 + 1 - if (1 << log_2) != bpv: - log_2 = None - - f.write(" static class Packed64SingleBlock%d extends Packed64SingleBlock {\n\n" %bpv) - - f.write(" Packed64SingleBlock%d(int valueCount) {\n" %bpv) - f.write(" super(valueCount, %d);\n" %bpv) - f.write(" }\n\n") - - f.write(" @Override\n") - f.write(" public long get(int index) {\n") - if log_2 is not None: - f.write(" final int o = index >>> %d;\n" %(6 - log_2)) - f.write(" final int b = index & %d;\n" %((1 << (6 - log_2)) - 1)) - f.write(" final int shift = b << %d;\n" %log_2) - else: - f.write(" final int o = index / %d;\n" %(64 / bpv)) - f.write(" final int b = index %% %d;\n" %(64 / bpv)) - f.write(" final int shift = b * %d;\n" %bpv) - f.write(" return (blocks[o] >>> shift) & %dL;\n" %((1 << bpv) - 1)) - f.write(" }\n\n") - - f.write(" @Override\n") - f.write(" public void set(int index, long value) {\n") - if log_2 is not None: - f.write(" final int o = index >>> %d;\n" %(6 - log_2)) - f.write(" final int b = index & %d;\n" %((1 << (6 - log_2)) - 1)) - f.write(" final int shift = b << %d;\n" %log_2) - else: - f.write(" final int o = index / %d;\n" %(64 / bpv)) - f.write(" final int b = index %% %d;\n" %(64 / bpv)) - f.write(" final int shift = b * %d;\n" %bpv) - f.write(" blocks[o] = (blocks[o] & ~(%dL << shift)) | (value << shift);\n" % ((1 << bpv) - 1)) - f.write(" }\n\n") - f.write(" }\n\n") - - f.write(FOOTER) - f.close() diff --git a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_PackedThreeBlocks.py b/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_PackedThreeBlocks.py deleted file mode 100644 index 18c670f69..000000000 --- a/fine-lucene/src/main/java/com/fr/third/org/apache/lucene/util/packed/gen_PackedThreeBlocks.py +++ /dev/null @@ -1,161 +0,0 @@ -#! /usr/bin/env python - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -HEADER="""// This file has been automatically generated, DO NOT EDIT - -package com.fr.third.org.apache.lucene.util.packed; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import com.fr.third.org.apache.lucene.store.DataInput; -import com.fr.third.org.apache.lucene.util.RamUsageEstimator; - -import java.io.IOException; -import java.util.Arrays; - -""" - -TYPES = {8: "byte", 16: "short"} -MASKS = {8: " & 0xFFL", 16: " & 0xFFFFL", 32: " & 0xFFFFFFFFL", 64: ""} -CASTS = {8: "(byte) ", 16: "(short) ", 32: "(int) ", 64: ""} - -if __name__ == '__main__': - for bpv in TYPES.keys(): - type - f = open("Packed%dThreeBlocks.java" %bpv, 'w') - f.write(HEADER) - f.write("""/** - * Packs integers into 3 %ss (%d bits per value). - * @lucene.internal - */\n""" %(TYPES[bpv], bpv*3)) - f.write("final class Packed%dThreeBlocks extends PackedInts.MutableImpl {\n" %bpv) - f.write(" final %s[] blocks;\n\n" %TYPES[bpv]) - - f.write(" public static final int MAX_SIZE = Integer.MAX_VALUE / 3;\n\n") - - f.write(" Packed%dThreeBlocks(int valueCount) {\n" %bpv) - f.write(" super(valueCount, %d);\n" %(bpv*3)) - f.write(" if (valueCount > MAX_SIZE) {\n") - f.write(" throw new ArrayIndexOutOfBoundsException(\"MAX_SIZE exceeded\");\n") - f.write(" }\n") - f.write(" blocks = new %s[valueCount * 3];\n" %TYPES[bpv]) - f.write(" }\n\n") - - f.write(" Packed%dThreeBlocks(DataInput in, int valueCount) throws IOException {\n" %bpv) - f.write(" this(valueCount);\n") - f.write(" for (int i = 0; i < 3 * valueCount; ++i) {\n") - f.write(" blocks[i] = in.read%s();\n" %TYPES[bpv].title()) - f.write(" }\n") - f.write(" final int mod = blocks.length %% %d;\n" %(64 / bpv)) - f.write(" if (mod != 0) {\n") - f.write(" for (int i = mod; i < %d; ++i) {\n" %(64 / bpv)) - f.write(" in.read%s();\n" %TYPES[bpv].title()) - f.write(" }\n") - f.write(" }\n") - f.write(" }\n") - - f.write(""" - @Override - public long get(int index) { - final int o = index * 3; - return (blocks[o]%s) << %d | (blocks[o+1]%s) << %d | (blocks[o+2]%s); - } - - @Override - public int get(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - assert off + len <= arr.length; - - final int gets = Math.min(valueCount - index, len); - for (int i = index * 3, end = (index + gets) * 3; i < end; i+=3) { - arr[off++] = (blocks[i]%s) << %d | (blocks[i+1]%s) << %d | (blocks[i+2]%s); - } - return gets; - } - - @Override - public void set(int index, long value) { - final int o = index * 3; - blocks[o] = %s(value >>> %d); - blocks[o+1] = %s(value >>> %d); - blocks[o+2] = %svalue; - } - - @Override - public int set(int index, long[] arr, int off, int len) { - assert len > 0 : "len must be > 0 (got " + len + ")"; - assert index >= 0 && index < valueCount; - assert off + len <= arr.length; - - final int sets = Math.min(valueCount - index, len); - for (int i = off, o = index * 3, end = off + sets; i < end; ++i) { - final long value = arr[i]; - blocks[o++] = %s(value >>> %d); - blocks[o++] = %s(value >>> %d); - blocks[o++] = %svalue; - } - return sets; - } - - @Override - public void fill(int fromIndex, int toIndex, long val) { - final %s block1 = %s(val >>> %d); - final %s block2 = %s(val >>> %d); - final %s block3 = %sval; - for (int i = fromIndex * 3, end = toIndex * 3; i < end; i += 3) { - blocks[i] = block1; - blocks[i+1] = block2; - blocks[i+2] = block3; - } - } - - @Override - public void clear() { - Arrays.fill(blocks, %s0); - } - - public long ramBytesUsed() { - return RamUsageEstimator.sizeOf(blocks); - } - - @Override - public String toString() { - return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue - + ", size=" + size() + ", elements.length=" + blocks.length + ")"; - } -} -""" %(MASKS[bpv], 2*bpv, MASKS[bpv], bpv, MASKS[bpv], MASKS[bpv], 2*bpv, MASKS[bpv], bpv, MASKS[bpv], CASTS[bpv], 2*bpv, CASTS[bpv], bpv, CASTS[bpv], CASTS[bpv], - 2*bpv, CASTS[bpv], bpv, CASTS[bpv], TYPES[bpv], CASTS[bpv], 2*bpv, TYPES[bpv], - CASTS[bpv], bpv, TYPES[bpv], CASTS[bpv], CASTS[bpv])) - - f.close()