// Web Worker script that wraps tesseract.js-core.
//
// Incoming messages (see `onmessage` at the bottom):
//   { init: { mem } }                               -> create the Tesseract instance
//   { fun: 'recognize', index, image, lang, options } -> run OCR
//   { fun: 'detect', index, image }                 -> run orientation/script detection
//
// Outgoing messages are either progress reports ({ index, progress })
// or a final { index, err, result }.

var Tesseract304 = require('tesseract.js-core');
var leveljs = require('level-js');
var pako = require('pako');

// Store used to cache language data between runs. When IndexedDB is not
// available a stub is used whose open() always reports an error, which makes
// loadLanguage download the data and skip caching.
var db;
if (typeof indexedDB === 'undefined') {
  db = {
    open: function (opts, cb) {
      cb(true); // err = true
    }
  };
} else {
  db = leveljs('./tessdata');
}

// Per-language byte counts, used as the denominator of the download progress
// fraction. NOTE(review): presumably the size of each .traineddata download;
// the code only shows them being divided into `e.loaded`.
var filesizes = {
  "afr": 1079573, "ara": 1701536, "aze": 1420865, "bel": 1276820,
  "ben": 6772012, "bul": 1605615, "cat": 1652368, "ces": 1035441,
  "chi_sim": 17710414, "chi_tra": 24717749, "chr": 320649, "dan-frak": 677656,
  "dan": 1972936, "deu-frak": 822644, "deu": 991656, "ell": 859719,
  "eng": 9453554, "enm": 619254, "epo": 1241212, "equ": 821130,
  "est": 1905040, "eus": 1641190, "fin": 979418, "fra": 1376221,
  "frk": 5912963, "frm": 5147082, "glg": 1674938, "grc": 3012615,
  "heb": 1051501, "hin": 6590065, "hrv": 1926995, "hun": 3074473,
  "ind": 1874776, "isl": 1634041, "ita": 948593, "ita_old": 3436571,
  "jpn": 13507168, "kan": 4390317, "kor": 5353098, "lav": 1843944,
  "lit": 1779240, "mal": 5966263, "meme": 88453, "mkd": 1163087,
  "mlt": 1463001, "msa": 1665427, "nld": 1134708, "nor": 2191610,
  "osd": 4274649, "pol": 7024662, "por": 909359, "ron": 915680,
  "rus": 5969957, "slk-frak": 289885, "slk": 2217342, "slv": 1611338,
  "spa": 883170, "spa_old": 5647453, "sqi": 1667041, "srp": 1770244,
  "swa": 757916, "swe": 2451917, "tam": 3498763, "tel": 5795246,
  "tgl": 1496256, "tha": 3811136, "tur": 3563264, "ukr": 937566,
  "vie": 2195922
};

// The instance created by the `init` message; used by later messages.
var T;

/**
 * Creates a Tesseract module plus the helpers that drive it.
 *
 * @param {number} memory - passed to the module as TOTAL_MEMORY.
 * @returns {{recognize: Function, detect: Function}}
 */
var tesseractinit = (function createTesseractInstance(memory) {
  // Index of the job currently being processed; attached to the recognition
  // progress messages emitted from the TesseractProgress hook.
  var curindex = 0;

  var Module = Tesseract304({
    TOTAL_MEMORY: memory, // must be a multiple of 10 megabytes
    TesseractProgress: function (percent) {
      postMessage({
        index: curindex,
        progress: {
          // Progress below 30 is clamped to 0; 30..100 is rescaled to 0..1.
          recognized: Math.max(0, (percent - 30) / 70)
        }
      });
    }
  });

  var base = new Module.TessBaseAPI();
  var loaded_langs = [];

  /**
   * Repeatedly gunzips `bytes` for as long as it starts with the gzip magic
   * number (0x1f 0x8b), and returns the result.
   */
  function ungzipAll(bytes) {
    while (bytes[0] === 0x1f && bytes[1] === 0x8b) {
      bytes = pako.ungzip(bytes);
    }
    return bytes;
  }

  /**
   * Makes the trained data for `lang` available in the module's virtual file
   * system, taking it from the cache when possible and downloading otherwise.
   *
   * @param {string} lang - language code, e.g. 'eng'.
   * @param {*} index - job index echoed back in progress messages.
   * @param {function(*, string=)} cb - NodeJS style callback: (err, lang).
   *   On failure `err` is a plain string so that it can be forwarded through
   *   postMessage (an XMLHttpRequest object cannot be structured-cloned).
   */
  var loadLanguage = function (lang, index, cb) {
    if (loaded_langs.indexOf(lang) !== -1) {
      cb(null, lang);
      return;
    }

    // NOTE(review): two jobs requesting the same not-yet-loaded language at
    // the same time would both reach FS_createDataFile for the same path.
    // Confirm that the main thread serializes jobs.
    Module.FS_createPath("/", "tessdata", true, true);

    var downloadFailed = function (xhr) {
      cb('failed to download language data for ' + lang +
        ' (HTTP status ' + xhr.status + ')', null);
    };

    var downloadlang = function (shouldcache) {
      postMessage({
        index: index,
        progress: {
          loaded_lang_model: 0,
          cached: false,
          requesting: true
        }
      });

      var xhr = new XMLHttpRequest();
      xhr.open('GET', 'https://cdn.rawgit.com/naptha/tessdata/gh-pages/3.02/' + lang + '.traineddata.gz', true);
      xhr.responseType = 'arraybuffer';

      xhr.onerror = function () {
        downloadFailed(xhr);
      };

      xhr.onprogress = function (e) {
        postMessage({
          index: index,
          progress: {
            loaded_lang_model: e.loaded / filesizes[lang], // this is kinda wrong on safari
            cached: false
          }
        });
      };

      xhr.onload = function () {
        // Status 0 with a response body is also accepted as success.
        if (!(xhr.status === 200 || (xhr.status === 0 && xhr.response))) {
          downloadFailed(xhr);
          return;
        }

        postMessage({ index: index, progress: 'unzipping_lang_model' });

        var response = ungzipAll(new Uint8Array(xhr.response));

        postMessage({
          index: index,
          progress: {
            unzipped_lang_model: true,
            lang_model_size: response.length
          }
        });

        Module.FS_createDataFile('tessdata', lang + ".traineddata", response, true, false);

        if (shouldcache) {
          // Caching is best effort: a failed write is reported but does not
          // fail the job, since the data is already loaded.
          db.put(lang, response, function (err) {
            if (err) {
              console.error('failed to cache lang', lang, err);
            } else {
              console.log('cached lang');
            }
          });
        }

        postMessage({
          index: index,
          progress: {
            created_virtual_datafile: true,
            cached_file: shouldcache
          }
        });

        loaded_langs.push(lang);
        cb(null, lang);
      };

      xhr.send(null);
    };

    db.open({ compression: false }, function (err) {
      if (err) {
        // No usable cache: download and do not try to store the result.
        downloadlang(false);
        return;
      }

      db.get(lang, function (err, value) {
        if (err) {
          // Cache miss: download and store the result.
          downloadlang(true);
          return;
        }

        value = ungzipAll(value);

        postMessage({
          index: index,
          progress: {
            loaded_lang_model: 1,
            cached: true
          }
        });

        Module.FS_createDataFile('tessdata', lang + ".traineddata", value, true, false);
        loaded_langs.push(lang);
        cb(null, lang);
      });
    });
  };

  /**
   * Adds parent back-references (page/block/paragraph/line/word) to every
   * node of the result tree and builds flat `paragraphs`, `lines`, `words`
   * and `symbols` lists on the ancestors. Mutates and returns `page`.
   */
  function circularize(page) {
    page.paragraphs = [];
    page.lines = [];
    page.words = [];
    page.symbols = [];

    page.blocks.forEach(function (block) {
      block.page = page;

      block.lines = [];
      block.words = [];
      block.symbols = [];

      block.paragraphs.forEach(function (para) {
        para.block = block;
        para.page = page;

        para.words = [];
        para.symbols = [];

        para.lines.forEach(function (line) {
          line.paragraph = para;
          line.block = block;
          line.page = page;

          line.symbols = [];

          line.words.forEach(function (word) {
            word.line = line;
            word.paragraph = para;
            word.block = block;
            word.page = page;

            word.symbols.forEach(function (sym) {
              sym.word = word;
              sym.line = line;
              sym.paragraph = para;
              sym.block = block;
              sym.page = page;

              sym.line.symbols.push(sym);
              sym.paragraph.symbols.push(sym);
              sym.block.symbols.push(sym);
              sym.page.symbols.push(sym);
            });

            word.paragraph.words.push(word);
            word.block.words.push(word);
            word.page.words.push(word);
          });

          line.block.lines.push(line);
          line.page.lines.push(line);
        });

        para.page.paragraphs.push(para);
      });
    });

    return page;
  }

  /**
   * Maps an enum value back to its name by searching the module's own keys:
   * returns the part after `prefix + '_'` of the first key that starts with
   * that prefix and whose value is `value`, or undefined when none matches.
   */
  function enumToString(value, prefix) {
    var stem = prefix + '_';
    var names = Object.keys(Module);
    for (var i = 0; i < names.length; i++) {
      var name = names[i];
      if (name.slice(0, stem.length) === stem && Module[name] === value) {
        return name.slice(stem.length);
      }
    }
    return undefined;
  }

  /**
   * Walks the result iterator symbol by symbol and collects the whole
   * block > paragraph > line > word > symbol hierarchy, together with
   * page-level text, hOCR, confidence and engine settings.
   * Must be called after recognition has run on `base`.
   */
  function DumpLiterallyEverything() {
    var ri = base.GetIterator();
    var blocks = [];
    var block, para, textline, word, symbol;

    ri.Begin();
    do {
      if (ri.IsAtBeginningOf(Module.RIL_BLOCK)) {
        var poly = ri.BlockPolygon();
        var polygon = null;
        // BlockPolygon() returns null when automatic page segmentation is off
        if (Module.getPointer(poly) > 0) {
          var n = poly.get_n(),
              px = poly.get_x(),
              py = poly.get_y();
          polygon = [];
          for (var i = 0; i < n; i++) {
            polygon.push([px.getValue(i), py.getValue(i)]);
          }
          Module._ptaDestroy(Module.getPointer(poly));
        }

        block = {
          paragraphs: [],

          text: ri.GetUTF8Text(Module.RIL_BLOCK),
          confidence: ri.Confidence(Module.RIL_BLOCK),
          baseline: ri.getBaseline(Module.RIL_BLOCK),
          bbox: ri.getBoundingBox(Module.RIL_BLOCK),

          blocktype: enumToString(ri.BlockType(), 'PT'),
          polygon: polygon
        };
        blocks.push(block);
      }

      if (ri.IsAtBeginningOf(Module.RIL_PARA)) {
        para = {
          lines: [],

          text: ri.GetUTF8Text(Module.RIL_PARA),
          confidence: ri.Confidence(Module.RIL_PARA),
          baseline: ri.getBaseline(Module.RIL_PARA),
          bbox: ri.getBoundingBox(Module.RIL_PARA),

          is_ltr: !!ri.ParagraphIsLtr()
        };
        block.paragraphs.push(para);
      }

      if (ri.IsAtBeginningOf(Module.RIL_TEXTLINE)) {
        textline = {
          words: [],

          text: ri.GetUTF8Text(Module.RIL_TEXTLINE),
          confidence: ri.Confidence(Module.RIL_TEXTLINE),
          baseline: ri.getBaseline(Module.RIL_TEXTLINE),
          bbox: ri.getBoundingBox(Module.RIL_TEXTLINE)
        };
        para.lines.push(textline);
      }

      if (ri.IsAtBeginningOf(Module.RIL_WORD)) {
        var fontInfo = ri.getWordFontAttributes(),
            wordDir = ri.WordDirection();
        word = {
          symbols: [],
          choices: [],

          text: ri.GetUTF8Text(Module.RIL_WORD),
          confidence: ri.Confidence(Module.RIL_WORD),
          baseline: ri.getBaseline(Module.RIL_WORD),
          bbox: ri.getBoundingBox(Module.RIL_WORD),

          is_numeric: !!ri.WordIsNumeric(),
          in_dictionary: !!ri.WordIsFromDictionary(),
          direction: enumToString(wordDir, 'DIR'),
          language: ri.WordRecognitionLanguage(),

          is_bold: fontInfo.is_bold,
          is_italic: fontInfo.is_italic,
          is_underlined: fontInfo.is_underlined,
          is_monospace: fontInfo.is_monospace,
          is_serif: fontInfo.is_serif,
          is_smallcaps: fontInfo.is_smallcaps,
          font_size: fontInfo.pointsize,
          font_id: fontInfo.font_id,
          font_name: fontInfo.font_name
        };
        var wc = new Module.WordChoiceIterator(ri);
        do {
          word.choices.push({
            text: wc.GetUTF8Text(),
            confidence: wc.Confidence()
          });
        } while (wc.Next());
        Module.destroy(wc);
        textline.words.push(word);
      }

      if (ri.IsAtBeginningOf(Module.RIL_SYMBOL)) {
        symbol = {
          choices: [],
          // Per-symbol images are not extracted; an earlier attempt was
          // disabled because destroying the returned pix broke things.
          image: null,

          text: ri.GetUTF8Text(Module.RIL_SYMBOL),
          confidence: ri.Confidence(Module.RIL_SYMBOL),
          baseline: ri.getBaseline(Module.RIL_SYMBOL),
          bbox: ri.getBoundingBox(Module.RIL_SYMBOL),

          is_superscript: !!ri.SymbolIsSuperscript(),
          is_subscript: !!ri.SymbolIsSubscript(),
          is_dropcap: !!ri.SymbolIsDropcap()
        };
        word.symbols.push(symbol);
        var ci = new Module.ChoiceIterator(ri);
        do {
          symbol.choices.push({
            text: ci.GetUTF8Text(),
            confidence: ci.Confidence()
          });
        } while (ci.Next());
        Module.destroy(ci);
      }
    } while (ri.Next(Module.RIL_SYMBOL));
    Module.destroy(ri);

    return {
      text: base.GetUTF8Text(),
      html: deindent(base.GetHOCRText()),

      confidence: base.MeanTextConf(),

      blocks: blocks,

      psm: enumToString(base.GetPageSegMode(), 'PSM'),
      oem: enumToString(base.oem(), 'OEM'),
      version: base.Version()
    };
  }

  /**
   * If the first line of `html` starts with two spaces, strips a leading
   * two-space indent from every line that has one.
   *
   * @param {string} html
   * @returns {string}
   */
  function deindent(html) {
    var INDENT = '  ';
    var lines = html.split('\n');
    if (lines[0].substring(0, 2) === INDENT) {
      for (var i = 0; i < lines.length; i++) {
        if (lines[i].substring(0, 2) === INDENT) {
          lines[i] = lines[i].slice(2);
        }
      }
    }
    return lines.join('\n');
  }

  /**
   * Converts an ImageData-like object ({ data, width, height } with 4 bytes
   * per pixel in `data`) to a Uint8Array holding one gray byte per pixel.
   *
   * @throws {Error} when `image.data` is missing.
   */
  function desaturate(image) {
    if (!image.data) {
      throw new Error('Expected ImageData');
    }

    var src = image.data;
    var dst = new Uint8Array(image.width * image.height);
    var srcLength = src.length | 0;

    // convert to grayscale; everything with alpha gets put in front of 50% gray
    for (var i = 0, j = 0; i < srcLength; i += 4, ++j) {
      dst[j] = (((src[i] * 77 + src[i + 1] * 151 + src[i + 2] * 28) * src[i + 3]) +
        ((255 - src[i + 3]) << 15) + 32768) >> 16;
    }

    return dst;
  }

  /**
   * Runs OCR on `image` using language `lang`.
   *
   * @param {*} index - job index echoed back in progress messages.
   * @param {{data, width, height}} image - ImageData-like input.
   * @param {string} lang - language code to load and initialize with.
   * @param {Object} options - own properties are passed to SetVariable.
   * @param {function(*, Object=)} cb - NodeJS style callback: (err, result).
   */
  function recognize(index, image, lang, options, cb) {
    var width = image.width,
        height = image.height;

    image = desaturate(image);

    // Copy the gray pixels into the module's heap; freed on every path below.
    var ptr = Module.allocate(image, 'i8', Module.ALLOC_NORMAL);

    loadLanguage(lang, index, function (err, result) {
      if (err) {
        console.error("error loading", lang);
        Module._free(ptr);
        cb(err, null);
        return;
      }

      curindex = index;
      base.Init(null, lang);

      postMessage({
        index: index,
        progress: {
          initialized_with_lang: true,
          lang: lang
        }
      });

      for (var option in options) {
        if (options.hasOwnProperty(option)) {
          base.SetVariable(option, options[option]);
          postMessage({
            index: index,
            progress: {
              set_variable: {
                variable: option,
                value: options[option]
              }
            }
          });
        }
      }

      // One byte per pixel, `width` bytes per row (the buffer is tightly packed).
      base.SetImage(Module.wrapPointer(ptr), width, height, 1, width);
      base.SetRectangle(0, 0, width, height);
      base.Recognize(null);

      var everything = circularize(DumpLiterallyEverything());

      base.End();
      Module._free(ptr);

      cb(null, everything);
    });
  }

  /**
   * Runs orientation and script detection on `image` using the 'osd' data.
   *
   * @param {*} index - job index echoed back in progress messages.
   * @param {{data, width, height}} image - ImageData-like input.
   * @param {function(*, Object=)} cb - NodeJS style callback: (err, result).
   */
  function detect(index, image, cb) {
    var width = image.width,
        height = image.height;

    image = desaturate(image);

    var ptr = Module.allocate(image, 'i8', Module.ALLOC_NORMAL);

    loadLanguage('osd', index, function (err, result) {
      if (err) {
        Module._free(ptr);
        cb(err);
        return;
      }

      curindex = index;
      base.Init(null, 'osd');
      base.SetPageSegMode(Module.PSM_OSD_ONLY);

      base.SetImage(Module.wrapPointer(ptr), width, height, 1, width);
      base.SetRectangle(0, 0, width, height);

      // NOTE(review): `results` is never destroyed; confirm whether the
      // binding expects Module.destroy(results) here.
      var results = new Module.OSResults();
      var success = base.DetectOS(results);

      if (!success) {
        base.End();
        Module._free(ptr);
        cb("failed to detect os");
        return;
      }

      var charset = results.get_unicharset();
      var best = results.get_best_result();
      var oid = best.get_orientation_id(),
          sid = best.get_script_id();

      // Read everything out of the native objects before tearing down.
      var detection = {
        tesseract_script_id: sid,
        script: charset.get_script_from_script_id(sid),
        script_confidence: best.get_sconfidence(),
        orientation_degrees: [0, 270, 180, 90][oid],
        orientation_confidence: best.get_oconfidence()
      };

      base.End();
      Module._free(ptr);

      cb(null, detection);
    });
  }

  return {
    recognize: recognize,
    detect: detect
  };
});

// Dispatches messages from the main thread. `init` must arrive before any
// `recognize` / `detect` message, since those use the instance it creates.
onmessage = function (e) {
  if (e.data.init) {
    T = tesseractinit(e.data.init.mem);
  } else if (e.data.fun === 'recognize') {
    T.recognize(e.data.index, e.data.image, e.data.lang, e.data.options, function (err, result) {
      postMessage({ index: e.data.index, err: err, result: result });
    });
  } else if (e.data.fun === 'detect') {
    T.detect(e.data.index, e.data.image, function (err, result) {
      postMessage({ index: e.data.index, err: err, result: result });
    });
  }
};