You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
141 lines
4.0 KiB
141 lines
4.0 KiB
var util = require('util'), |
|
Match = require ('../match'); |
|
|
|
|
|
/**
 * This is a superclass for the individual detectors for
 * each of the detectable members of the ISO 2022 family
 * of encodings.
 *
 * Subclasses are expected to set `this.escapeSequences` to an array of byte
 * arrays; `match` reads that property.
 */
function ISO_2022() {}

/**
 * Tests whether one escape sequence occurs in the input at a given position.
 *
 * The first byte of `seq` is not compared: callers only invoke this when the
 * byte at `pos` has already been identified as ESC (0x1b).
 *
 * @param text    the byte buffer being analysed
 * @param textLen the number of bytes of `text` that may be read
 * @param pos     index of the ESC byte in `text`
 * @param seq     candidate escape sequence (array of byte values)
 * @return true if `seq` fits in the remaining input and bytes 1..n-1 match
 */
function escapeSequenceMatchesAt(text, textLen, pos, seq) {
  // A sequence that would run past the end of the sample cannot match.
  if (textLen - pos < seq.length) {
    return false;
  }

  for (var j = 1; j < seq.length; j++) {
    if (seq[j] !== text[pos + j]) {
      return false;
    }
  }

  return true;
}

/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 * Counts up the number of legal and unrecognized escape sequences in
 * the sample of text, and computes a score based on the total number and
 * the proportion that fit the encoding.
 *
 * @param det detection context; `det.fInputBytes` is the byte buffer to
 *            analyse and `det.fInputLen` is the number of bytes to scan.
 * @return a `Match` built from (det, this, quality) when the quality is
 *         positive, otherwise null (no recognized escape sequence, or the
 *         score dropped to zero or below).
 */
ISO_2022.prototype.match = function(det) {
  // TODO: refactor me
  var text = det.fInputBytes;
  var textLen = det.fInputLen;
  var escapeSequences = this.escapeSequences;

  var hits = 0;
  var misses = 0;
  var shifts = 0;
  var quality;
  var i, escN, matched;

  for (i = 0; i < textLen; i++) {
    if (text[i] === 0x1b) {
      // Find the first known escape sequence that starts at this ESC byte.
      matched = null;
      for (escN = 0; escN < escapeSequences.length; escN++) {
        if (escapeSequenceMatchesAt(text, textLen, i, escapeSequences[escN])) {
          matched = escapeSequences[escN];
          break;
        }
      }

      if (matched !== null) {
        hits++;
        // Skip the rest of the sequence; the loop increment steps past
        // its final byte.
        i += matched.length - 1;
      } else {
        misses++;
      }

      // An ESC byte can never be a shift byte, so move on.
      continue;
    }

    // Shift in/out
    if (text[i] === 0x0e || text[i] === 0x0f) {
      shifts++;
    }
  }

  if (hits === 0) {
    return null;
  }

  //
  // Initial quality is based on relative proportion of recognized vs.
  //   unrecognized escape sequences.
  //   All good:  quality = 100;
  //   half or less good: quality = 0;
  //   linear in between.
  quality = (100 * hits - 100 * misses) / (hits + misses);

  // Back off quality if there were too few escape sequences seen.
  //   Include shifts in this computation, so that KR does not get penalized
  //   for having only a single Escape sequence, but many shifts.
  if (hits + shifts < 5) {
    quality -= (5 - (hits + shifts)) * 10;
  }

  return quality <= 0 ? null : new Match(det, this, quality);
};
|
|
|
/**
 * Detector for ISO-2022-JP.
 *
 * Each instance exposes `name()` and `escapeSequences`, the list of byte
 * sequences (each starting with ESC, 0x1b) that count as recognized escapes
 * for this encoding. Matching logic is inherited from ISO_2022.
 */
module.exports.ISO_2022_JP = function() {
  /** @return the encoding name reported by this detector. */
  this.name = function() {
    return 'ISO-2022-JP';
  };

  this.escapeSequences = [
    [ 0x1b, 0x24, 0x28, 0x43 ],   // KS X 1001:1992
    [ 0x1b, 0x24, 0x28, 0x44 ],   // JIS X 212-1990
    [ 0x1b, 0x24, 0x40 ],         // JIS C 6226-1978
    [ 0x1b, 0x24, 0x41 ],         // GB 2312-80
    [ 0x1b, 0x24, 0x42 ],         // JIS X 208-1983
    [ 0x1b, 0x26, 0x40 ],         // JIS X 208 1990, 1997
    [ 0x1b, 0x28, 0x42 ],         // ASCII
    [ 0x1b, 0x28, 0x48 ],         // JIS-Roman
    [ 0x1b, 0x28, 0x49 ],         // Half-width katakana
    [ 0x1b, 0x28, 0x4a ],         // JIS-Roman
    [ 0x1b, 0x2e, 0x41 ],         // ISO 8859-1
    [ 0x1b, 0x2e, 0x46 ]          // ISO 8859-7
  ];
};
util.inherits(module.exports.ISO_2022_JP, ISO_2022);
|
|
|
|
|
|
|
/**
 * Detector for ISO-2022-KR.
 *
 * Each instance exposes `name()` and `escapeSequences`; this encoding has a
 * single recognized escape sequence. Matching logic is inherited from
 * ISO_2022.
 */
module.exports.ISO_2022_KR = function() {
  /** @return the encoding name reported by this detector. */
  this.name = function() {
    return 'ISO-2022-KR';
  };

  this.escapeSequences = [
    [ 0x1b, 0x24, 0x29, 0x43 ]
  ];
};
util.inherits(module.exports.ISO_2022_KR, ISO_2022);
|
|
|
|
|
|
|
/**
 * Detector for ISO-2022-CN.
 *
 * Each instance exposes `name()` and `escapeSequences`, the list of byte
 * sequences (each starting with ESC, 0x1b) that count as recognized escapes
 * for this encoding. Matching logic is inherited from ISO_2022.
 */
module.exports.ISO_2022_CN = function() {
  /** @return the encoding name reported by this detector. */
  this.name = function() {
    return 'ISO-2022-CN';
  };

  this.escapeSequences = [
    [ 0x1b, 0x24, 0x29, 0x41 ],   // GB 2312-80
    [ 0x1b, 0x24, 0x29, 0x47 ],   // CNS 11643-1992 Plane 1
    [ 0x1b, 0x24, 0x2A, 0x48 ],   // CNS 11643-1992 Plane 2
    [ 0x1b, 0x24, 0x29, 0x45 ],   // ISO-IR-165
    [ 0x1b, 0x24, 0x2B, 0x49 ],   // CNS 11643-1992 Plane 3
    [ 0x1b, 0x24, 0x2B, 0x4A ],   // CNS 11643-1992 Plane 4
    [ 0x1b, 0x24, 0x2B, 0x4B ],   // CNS 11643-1992 Plane 5
    [ 0x1b, 0x24, 0x2B, 0x4C ],   // CNS 11643-1992 Plane 6
    [ 0x1b, 0x24, 0x2B, 0x4D ],   // CNS 11643-1992 Plane 7
    [ 0x1b, 0x4e ],               // SS2
    [ 0x1b, 0x4f ]                // SS3
  ];
};
util.inherits(module.exports.ISO_2022_CN, ISO_2022);
|
|
|