From 6437f2870461bc27e3c744af9cb6d8090d9b9144 Mon Sep 17 00:00:00 2001 From: Balearica Date: Mon, 29 May 2023 11:40:45 -0700 Subject: [PATCH] Add ability to analyse layout without running recognition (#770) --- src/index.d.ts | 1 + src/worker-script/constants/defaultOutput.js | 2 ++ src/worker-script/index.js | 14 +++++--- src/worker-script/utils/dump.js | 38 +++++++++++--------- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/src/index.d.ts b/src/index.d.ts index 2a01a75..8b810e0 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -70,6 +70,7 @@ declare namespace Tesseract { interface OutputFormats { text: boolean; blocks: boolean; + layoutBlocks: boolean; hocr: boolean; tsv: boolean; box: boolean; diff --git a/src/worker-script/constants/defaultOutput.js b/src/worker-script/constants/defaultOutput.js index b184e17..0c16697 100644 --- a/src/worker-script/constants/defaultOutput.js +++ b/src/worker-script/constants/defaultOutput.js @@ -5,6 +5,7 @@ module.exports = { text: true, blocks: true, + layoutBlocks: false, hocr: true, tsv: true, box: false, @@ -14,4 +15,5 @@ module.exports = { imageColor: false, imageGrey: false, imageBinary: false, + debug: false, }; diff --git a/src/worker-script/index.js b/src/worker-script/index.js index 9f23409..30e701f 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -255,7 +255,7 @@ const processOutput = (output) => { if (params.tessjs_create_tsv === '1') workingOutput.tsv = true; if (params.tessjs_create_unlv === '1') workingOutput.unlv = true; - const nonRecOutputs = ['imageColor', 'imageGrey', 'imageBinary']; + const nonRecOutputs = ['imageColor', 'imageGrey', 'imageBinary', 'layoutBlocks']; let recOutputCount = 0; for (const prop of Object.keys(output)) { workingOutput[prop] = output[prop]; @@ -267,7 +267,8 @@ const processOutput = (output) => { } } } - return { workingOutput, recOutputCount }; + const skipRecognition = recOutputCount === 0; + return { workingOutput, skipRecognition }; }; // List of options for Tesseract.js (rather than passed through to Tesseract), @@ -302,7 +303,7 @@ const recognize = async ({ } } - const { workingOutput, recOutputCount } = processOutput(output); + const { workingOutput, skipRecognition } = processOutput(output); // When the auto-rotate option is True, setImage is called with no angle, // then the angle is calculated by Tesseract and then setImage is re-called. @@ -352,14 +353,17 @@ const recognize = async ({ api.SetRectangle(rec.left, rec.top, rec.width, rec.height); } - if (recOutputCount > 0) { + if (!skipRecognition) { api.Recognize(null); } else { + if (output.layoutBlocks) { + api.AnalyseLayout(); + } log('Skipping recognition: all output options requiring recognition are disabled.'); } const { pdfTitle } = options; const { pdfTextOnly } = options; - const result = dump(TessModule, api, workingOutput, { pdfTitle, pdfTextOnly }); + const result = dump(TessModule, api, workingOutput, { pdfTitle, pdfTextOnly, skipRecognition }); result.rotateRadians = rotateRadiansFinal; if (output.debug) TessModule.FS.unlink('/debugInternal.txt'); diff --git a/src/worker-script/utils/dump.js b/src/worker-script/utils/dump.js index 7edf31c..9c4d3a1 100644 --- a/src/worker-script/utils/dump.js +++ b/src/worker-script/utils/dump.js @@ -79,7 +79,10 @@ module.exports = (TessModule, api, output, options) => { return TessModule.FS.readFile('/tesseract-ocr.pdf'); }; - if (output.blocks) { + // If output.layoutBlocks is true and options.skipRecognition is true, + // the user wants layout data but text recognition has not been run. + // In this case, fields that require text recognition are skipped. + if (output.blocks || output.layoutBlocks) { ri.Begin(); do { if (ri.IsAtBeginningOf(RIL_BLOCK)) { @@ -102,8 +105,8 @@ module.exports = (TessModule, api, output, options) => { block = { paragraphs: [], - text: ri.GetUTF8Text(RIL_BLOCK), - confidence: ri.Confidence(RIL_BLOCK), + text: !options.skipRecognition ? ri.GetUTF8Text(RIL_BLOCK) : null, + confidence: !options.skipRecognition ? ri.Confidence(RIL_BLOCK) : null, baseline: ri.getBaseline(RIL_BLOCK), bbox: ri.getBoundingBox(RIL_BLOCK), blocktype: enumToString(ri.BlockType(), 'PT'), @@ -114,8 +117,8 @@ module.exports = (TessModule, api, output, options) => { if (ri.IsAtBeginningOf(RIL_PARA)) { para = { lines: [], - text: ri.GetUTF8Text(RIL_PARA), - confidence: ri.Confidence(RIL_PARA), + text: !options.skipRecognition ? ri.GetUTF8Text(RIL_PARA) : null, + confidence: !options.skipRecognition ? ri.Confidence(RIL_PARA) : null, baseline: ri.getBaseline(RIL_PARA), bbox: ri.getBoundingBox(RIL_PARA), is_ltr: !!ri.ParagraphIsLtr(), @@ -125,8 +128,8 @@ module.exports = (TessModule, api, output, options) => { if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { textline = { words: [], - text: ri.GetUTF8Text(RIL_TEXTLINE), - confidence: ri.Confidence(RIL_TEXTLINE), + text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null, + confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null, baseline: ri.getBaseline(RIL_TEXTLINE), bbox: ri.getBoundingBox(RIL_TEXTLINE), }; @@ -139,8 +142,8 @@ module.exports = (TessModule, api, output, options) => { symbols: [], choices: [], - text: ri.GetUTF8Text(RIL_WORD), - confidence: ri.Confidence(RIL_WORD), + text: !options.skipRecognition ? ri.GetUTF8Text(RIL_WORD) : null, + confidence: !options.skipRecognition ? ri.Confidence(RIL_WORD) : null, baseline: ri.getBaseline(RIL_WORD), bbox: ri.getBoundingBox(RIL_WORD), @@ -162,8 +165,8 @@ module.exports = (TessModule, api, output, options) => { const wc = new TessModule.WordChoiceIterator(ri); do { word.choices.push({ - text: wc.GetUTF8Text(), - confidence: wc.Confidence(), + text: !options.skipRecognition ? wc.GetUTF8Text() : null, + confidence: !options.skipRecognition ? wc.Confidence() : null, }); } while (wc.Next()); TessModule.destroy(wc); @@ -179,8 +182,8 @@ module.exports = (TessModule, api, output, options) => { symbol = { choices: [], image: null, - text: ri.GetUTF8Text(RIL_SYMBOL), - confidence: ri.Confidence(RIL_SYMBOL), + text: !options.skipRecognition ? ri.GetUTF8Text(RIL_SYMBOL) : null, + confidence: !options.skipRecognition ? ri.Confidence(RIL_SYMBOL) : null, baseline: ri.getBaseline(RIL_SYMBOL), bbox: ri.getBoundingBox(RIL_SYMBOL), is_superscript: !!ri.SymbolIsSuperscript(), @@ -191,8 +194,8 @@ module.exports = (TessModule, api, output, options) => { const ci = new TessModule.ChoiceIterator(ri); do { symbol.choices.push({ - text: ci.GetUTF8Text(), - confidence: ci.Confidence(), + text: !options.skipRecognition ? ci.GetUTF8Text() : null, + confidence: !options.skipRecognition ? ci.Confidence() : null, }); } while (ci.Next()); // TessModule.destroy(i); @@ -212,8 +215,9 @@ module.exports = (TessModule, api, output, options) => { imageColor: output.imageColor ? getImage(imageType.COLOR) : null, imageGrey: output.imageGrey ? getImage(imageType.GREY) : null, imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null, - confidence: api.MeanTextConf(), - blocks: output.blocks ? blocks : null, + confidence: !options.skipRecognition ? api.MeanTextConf() : null, + blocks: output.blocks && !options.skipRecognition ? blocks : null, + layoutBlocks: output.layoutBlocks && options.skipRecognition ? blocks : null, psm: enumToString(api.GetPageSegMode(), 'PSM'), oem: enumToString(api.oem(), 'OEM'), version: api.Version(),