Pure Javascript OCR for more than 100 Languages 📖🎉🖥
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

533 lines
15 KiB

var Tesseract304 = require('tesseract.js-core')
var leveljs = require('level-js')
var db;
if (typeof indexedDB === 'undefined'){
db = { open: function(opts, cb){ cb(true) /*err = true*/ } }
}
else {
db = leveljs('./tessdata')
}
console.log('hallo')
var filesizes = {"afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820, "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441, "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656, "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719, "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130, "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221, "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615, "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473, "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571, "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944, "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087, "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610, "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680, "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338, "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244, "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246, "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566, "vie": 2195922}
var pako = require('pako')
var T;
var tesseractinit = (function createTesseractInstance(memory){
curindex = 0
var Module = Tesseract304({
TOTAL_MEMORY: memory, //must be a multiple of 10 megabytes
TesseractProgress: function(percent){
postMessage({
index: curindex,
'progress': {
'recognized': Math.max(0,(percent-30)/70)
}
})
}//,
// onRuntimeInitialized: function(){
// console.log('wau')
// }
})
var base = new Module.TessBaseAPI()
var loaded_langs = []
var loadLanguage = function(lang, index, cb){ // NodeJS style callback
if(loaded_langs.indexOf(lang) != -1){
cb(null, lang)
}
else{
Module.FS_createPath("/","tessdata",true,true)
var downloadlang = function(shouldcache){
postMessage({
index: index,
'progress': {
'loaded_lang_model': 0,
cached: false,
requesting: true
}
})
var xhr = new XMLHttpRequest();
xhr.open('GET', 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/'+lang+'.traineddata.gz', true);
xhr.responseType = 'arraybuffer';
xhr.onerror = function(){ cb(xhr, null) }
xhr.onprogress = function(e){
postMessage({
index: index,
'progress': {
'loaded_lang_model': e.loaded/filesizes[lang], //this is kinda wrong on safari
cached: false
}
})
}
xhr.onload = function(){
if (xhr.status == 200 || (xhr.status == 0 && xhr.response)) {
postMessage({
index: index,
'progress': 'unzipping_lang_model'
})
var response = new Uint8Array(xhr.response)
while(response[0] == 0x1f && response[1] == 0x8b){
response = pako.ungzip(response)
}
console.log('asdf')
postMessage({
index: index,
'progress': {
'unzipped_lang_model': true,
'lang_model_size': response.length
}
})
Module.FS_createDataFile('tessdata', lang +".traineddata", response, true, false);
if(shouldcache){
db.put(lang, response, function(err){
console.log('cached lang')
})
}
postMessage({
index: index,
'progress': {
'created_virtual_datafile': true,
'cached_file': shouldcache
}
})
loaded_langs.push(lang)
cb(null, lang)
} else cb(xhr, null);
}
xhr.send(null)
}
db.open({compression: false},function(err){
// err = true
if (err) {
downloadlang(false)
}
else {
db.get(lang, function (err, value) {
// err = true
if (err) {
downloadlang(true)
}
else {
while(value[0] == 0x1f && value[1] == 0x8b){
value = pako.ungzip(value)
}
postMessage({
index: index,
'progress': {
loaded_lang_model:1,
cached: true
}
})
Module.FS_createDataFile('tessdata', lang +".traineddata", value, true, false);
loaded_langs.push(lang)
cb(null, lang)
}
})
}
})
}
}
function circularize(page){
page.paragraphs = []
page.lines = []
page.words = []
page.symbols = []
page.blocks.forEach(function(block){
block.page = page;
block.lines = []
block.words = []
block.symbols = []
block.paragraphs.forEach(function(para){
para.block = block;
para.page = page;
para.words = []
para.symbols = []
para.lines.forEach(function(line){
line.paragraph = para;
line.block = block;
line.page = page;
line.symbols = []
line.words.forEach(function(word){
word.line = line;
word.paragraph = para;
word.block = block;
word.page = page;
word.symbols.forEach(function(sym){
sym.word = word;
sym.line = line;
sym.paragraph = para;
sym.block = block;
sym.page = page;
sym.line.symbols.push(sym)
sym.paragraph.symbols.push(sym)
sym.block.symbols.push(sym)
sym.page.symbols.push(sym)
})
word.paragraph.words.push(word)
word.block.words.push(word)
word.page.words.push(word)
})
line.block.lines.push(line)
line.page.lines.push(line)
})
para.page.paragraphs.push(para)
})
})
return page
}
function DumpLiterallyEverything(){
var ri = base.GetIterator();
var blocks = [];
var block, para, textline, word, symbol;
function enumToString(value, prefix){
return (Object.keys(Module)
.filter(function(e){ return e.substr(0, prefix.length + 1) == prefix + '_' })
.filter(function(e){ return Module[e] === value })
.map(function(e){ return e.slice(prefix.length + 1) })[0])
}
ri.Begin()
do {
if(ri.IsAtBeginningOf(Module.RIL_BLOCK)){
var poly = ri.BlockPolygon();
var polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if(Module.getPointer(poly) > 0){
var n = poly.get_n(),
px = poly.get_x(),
py = poly.get_y(),
polygon = [];
for(var i = 0; i < n; i++){
polygon.push([px.getValue(i), py.getValue(i)]);
}
Module._ptaDestroy(Module.getPointer(poly));
}
block = {
paragraphs: [],
text: ri.GetUTF8Text(Module.RIL_BLOCK),
confidence: ri.Confidence(Module.RIL_BLOCK),
baseline: ri.getBaseline(Module.RIL_BLOCK),
bbox: ri.getBoundingBox(Module.RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon: polygon
}
blocks.push(block)
}
if(ri.IsAtBeginningOf(Module.RIL_PARA)){
para = {
lines: [],
text: ri.GetUTF8Text(Module.RIL_PARA),
confidence: ri.Confidence(Module.RIL_PARA),
baseline: ri.getBaseline(Module.RIL_PARA),
bbox: ri.getBoundingBox(Module.RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr()
}
block.paragraphs.push(para)
}
if(ri.IsAtBeginningOf(Module.RIL_TEXTLINE)){
textline = {
words: [],
text: ri.GetUTF8Text(Module.RIL_TEXTLINE),
confidence: ri.Confidence(Module.RIL_TEXTLINE),
baseline: ri.getBaseline(Module.RIL_TEXTLINE),
bbox: ri.getBoundingBox(Module.RIL_TEXTLINE)
}
para.lines.push(textline)
}
if(ri.IsAtBeginningOf(Module.RIL_WORD)){
var fontInfo = ri.getWordFontAttributes(),
wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(Module.RIL_WORD),
confidence: ri.Confidence(Module.RIL_WORD),
baseline: ri.getBaseline(Module.RIL_WORD),
bbox: ri.getBoundingBox(Module.RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
}
var wc = new Module.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence()
})
} while (wc.Next());
Module.destroy(wc)
textline.words.push(word)
}
var image = null;
// var pix = ri.GetBinaryImage(Module.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// Module._pixDestroy(Module.getPointer(pix));
if(ri.IsAtBeginningOf(Module.RIL_SYMBOL)){
symbol = {
choices: [],
image: image,
text: ri.GetUTF8Text(Module.RIL_SYMBOL),
confidence: ri.Confidence(Module.RIL_SYMBOL),
baseline: ri.getBaseline(Module.RIL_SYMBOL),
bbox: ri.getBoundingBox(Module.RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
}
word.symbols.push(symbol)
var ci = new Module.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence()
})
} while (ci.Next());
Module.destroy(ci)
}
} while (ri.Next(Module.RIL_SYMBOL));
Module.destroy(ri)
return {
text: base.GetUTF8Text(),
html: deindent(base.GetHOCRText()),
confidence: base.MeanTextConf(),
blocks: blocks,
psm: enumToString(base.GetPageSegMode(), 'PSM'),
oem: enumToString(base.oem(), 'OEM'),
version: base.Version(),
}
}
function deindent(html){
var lines = html.split('\n')
if(lines[0].substring(0,2) === " "){
for (var i = 0; i < lines.length; i++) {
if (lines[i].substring(0,2) === " ") {
lines[i] = lines[i].slice(2)
}
};
}
return lines.join('\n')
}
function desaturate(image){
var width, height;
if(image.data){
var src = image.data;
width = image.width, height = image.height;
var dst = new Uint8Array(width * height);
var srcLength = src.length | 0, srcLength_16 = (srcLength - 16) | 0;
for (var i = 0, j = 0; i <= srcLength_16; i += 16, j += 4) {
// convert to grayscale 4 pixels at a time; eveything with alpha get put in front of 50% gray
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16
dst[j+1] = (((src[i+4] * 77 + src[i+5] * 151 + src[i+6] * 28) * src[i+7]) + ((255-src[i+7]) << 15) + 32768) >> 16
dst[j+2] = (((src[i+8] * 77 + src[i+9] * 151 + src[i+10] * 28) * src[i+11]) + ((255-src[i+11]) << 15) + 32768) >> 16
dst[j+3] = (((src[i+12] * 77 + src[i+13] * 151 + src[i+14] * 28) * src[i+15]) + ((255-src[i+15]) << 15) + 32768) >> 16
}
for (; i < srcLength; i += 4, ++j) //finish up
dst[j] = (((src[i] * 77 + src[i+1] * 151 + src[i+2] * 28) * src[i+3]) + ((255-src[i+3]) << 15) + 32768) >> 16
image = dst;
}
else {
throw 'Expected ImageData'
}
return image
}
function recognize(index, image, lang, options, cb){
var width = image.width, height = image.height;
image = desaturate(image)
var ptr = Module.allocate(image, 'i8', Module.ALLOC_NORMAL);
loadLanguage(lang, index, function(err, result){
if(err){
console.error("error loading", lang);
Module._free(ptr);
cb(err, null)
}
else {
curindex = index
base.Init(null, lang)
postMessage({
index: index,
'progress': {
'initialized_with_lang': true,
'lang': lang
}
})
for (var option in options) {
if (options.hasOwnProperty(option)) {
base.SetVariable(option, options[option]);
postMessage({
index: index,
'progress': {
'set_variable': {
variable: option,
value: options[option]
}
}
})
}
}
base.SetImage(Module.wrapPointer(ptr), width, height, 1, width)
base.SetRectangle(0, 0, width, height)
// base.GetUTF8Text()
base.Recognize(null)
var everything = circularize(DumpLiterallyEverything())
base.End();
Module._free(ptr);
cb(null, everything)
}
})
}
function detect(index, image, cb){
var width = image.width, height = image.height;
image = desaturate(image)
var ptr = Module.allocate(image, 'i8', Module.ALLOC_NORMAL);
console.log('allocated image')
// base = new Module.TessBaseAPI()
loadLanguage('osd', index, function(err, result){
if(err){
Module._free(ptr);
cb(err)
}
else {
curindex = index
base.Init(null, 'osd')
base.SetPageSegMode(Module.PSM_OSD_ONLY)
console.log('loaded language')
base.SetImage(Module.wrapPointer(ptr), width, height, 1, width)
base.SetRectangle(0, 0, width, height)
var results = new Module.OSResults();
var success = base.DetectOS(results);
if(!success){
base.End();
Module._free(ptr);
cb("failed to detect os")
}
else {
var charset = results.get_unicharset()
console.log(charset)
// results.print_scores()
var best = results.get_best_result()
var oid = best.get_orientation_id(),
sid = best.get_script_id();
// console.log('orientation id', oid, [0, 270, 180, 90][oid], best.get_oconfidence())
// console.log('script id', sid, charset.get_script_from_script_id(sid), best.get_sconfidence())
// console.log(best)
cb(null, {
tesseract_script_id: sid,
script: charset.get_script_from_script_id(sid),
script_confidence: best.get_sconfidence(),
orientation_degrees: [0, 270, 180, 90][oid],
orientation_confidence: best.get_oconfidence()
})
base.End();
Module._free(ptr);
}
}
})
}
return {
recognize: recognize,
detect: detect
}
})
onmessage = function(e) {
if(e.data.init){
T = tesseractinit(e.data.init.mem)
}
else if(e.data.fun === 'recognize'){
T.recognize(e.data.index, e.data.image, e.data.lang, e.data.options, function(err, result){
postMessage({index: e.data.index, err:err, result: result})
})
}
else if(e.data.fun === 'detect'){
T.detect(e.data.index, e.data.image, function(err, result){
postMessage({index: e.data.index, err:err, result: result})
})
}
}