Update to Tesseract.js Version 4 (#691)

See #662 for explanation of Tesseract.js Version 4 changes.  List below is auto-generated from commits. 

* Added image preprocessing functions (rotate + save images)

* Updated createWorker to be async

* Reworked createWorker to be async and throw errors per #654

* Reworked createWorker to be async and throw errors per #654

* Edited detect to return null when detection fails rather than throwing error per #526

* Updated types per #606 and #580 (#663) (#664)

* Removed unused files

* Added savePDF option to recognize per #488; cleaned up code for linter

* Updated download-pdf example for node to use new savePDF option

* Added OutputFormats option/interface for setting output

* Allowed for Tesseract parameters to be set through recognition options per #665

* Updated docs

* Edited loadLanguage to no longer overwrite cache with data from cache per #666

* Added interface for setting 'init only' options per #613

* Wrapped caching in try block per #609

* Fixed unit tests

* Updated setImage to resolve memory leak per #678

* Added debug output option per #681

* Fixed bug with saving images per #588

* Updated examples

* Updated readme and Tesseract.js-core version
master
Balearica 2 years ago committed by GitHub
parent 80aef15861
commit 5ccb3d9cee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 5
      .eslintrc
  2. 13
      README.md
  3. 44
      docs/api.md
  4. 41
      docs/examples.md
  5. 40
      docs/faq.md
  6. 4
      docs/local-installation.md
  7. 37
      examples/browser/basic-edge.html
  8. 34
      examples/browser/basic-efficient.html
  9. 7
      examples/browser/basic.html
  10. 8
      examples/browser/benchmark.html
  11. 4
      examples/browser/demo.html
  12. 14
      examples/browser/download-pdf.html
  13. 59
      examples/browser/image-processing.html
  14. 4
      examples/node/benchmark.js
  15. 8
      examples/node/download-pdf.js
  16. 7
      examples/node/recognize.js
  17. 26
      examples/node/scheduler.js
  18. 2957
      package-lock.json
  19. 6
      package.json
  20. 6
      src/Tesseract.js
  21. 5
      src/constants/imageType.js
  22. 44
      src/createWorker.js
  23. 55
      src/index.d.ts
  24. 36
      src/utils/circularize.js
  25. 17
      src/worker-script/constants/defaultOutput.js
  26. 230
      src/worker-script/index.js
  27. 56
      src/worker-script/utils/arrayBufferToBase64.js
  28. 275
      src/worker-script/utils/dump.js
  29. 58
      src/worker-script/utils/setImage.js
  30. 6
      tests/FS.test.js
  31. 18
      tests/detect.test.js
  32. 17
      tests/recognize.test.js
  33. 3
      tests/scheduler.test.js

@ -12,6 +12,9 @@
"no-console": 0, "no-console": 0,
"global-require": 0, "global-require": 0,
"camelcase": 0, "camelcase": 0,
"no-control-regex": 0 "no-control-regex": 0,
// Airbnb disallows ForOfStatement based on the bizarre belief that loops are not readable
// https://github.com/airbnb/javascript/issues/1271
"no-restricted-syntax": ["error", "ForInStatement", "LabeledStatement", "WithStatement"]
} }
} }

@ -46,12 +46,11 @@ Or more imperative
```javascript ```javascript
import { createWorker } from 'tesseract.js'; import { createWorker } from 'tesseract.js';
const worker = createWorker({ const worker = await createWorker({
logger: m => console.log(m) logger: m => console.log(m)
}); });
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
@ -62,6 +61,16 @@ const worker = createWorker({
[Check out the docs](#documentation) for a full explanation of the API. [Check out the docs](#documentation) for a full explanation of the API.
## Major changes in v4
Version 4 includes many new features and bug fixes--see [this issue](https://github.com/naptha/tesseract.js/issues/662) for a full list. Several highlights are below.
- Added rotation preprocessing options (including auto-rotate) for significantly better accuracy
- Processed images (rotated, grayscale, binary) can now be retrieved
- Improved support for parallel processing (schedulers)
- Breaking changes:
- `createWorker` is now async
- `getPDF` function replaced by `pdf` recognize option
## Major changes in v3 ## Major changes in v3
- Significantly faster performance - Significantly faster performance
- Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data) - Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data)

@ -1,7 +1,6 @@
# API # API
- [createWorker()](#create-worker) - [createWorker()](#create-worker)
- [Worker.load](#worker-load)
- [Worker.writeText](#worker-writeText) - [Worker.writeText](#worker-writeText)
- [Worker.readText](#worker-readText) - [Worker.readText](#worker-readText)
- [Worker.removeFile](#worker-removeFile) - [Worker.removeFile](#worker-removeFile)
@ -53,7 +52,7 @@ createWorker is a factory function that creates a tesseract worker, a worker is
```javascript ```javascript
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
const worker = createWorker({ const worker = await createWorker({
langPath: '...', langPath: '...',
logger: m => console.log(m), logger: m => console.log(m),
}); });
@ -63,7 +62,6 @@ const worker = createWorker({
A Worker helps you to do the OCR related tasks, it takes few steps to setup Worker before it is fully functional. The full flow is: A Worker helps you to do the OCR related tasks, it takes few steps to setup Worker before it is fully functional. The full flow is:
- load
- FS functions // optional - FS functions // optional
- loadLanguage - loadLanguage
- initialize - initialize
@ -82,23 +80,6 @@ Each function is async, so using async/await or Promise is required. When it is
jobId is generated by Tesseract.js, but you can put your own when calling any of the function above. jobId is generated by Tesseract.js, but you can put your own when calling any of the function above.
<a name="worker-load"></a>
### Worker.load(jobId): Promise
Worker.load() loads tesseract.js-core scripts (download from remote if not presented), it makes Web Worker/Child Process ready for next action.
**Arguments:**
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.load();
})();
```
<a name="worker-writeText"></a> <a name="worker-writeText"></a>
### Worker.writeText(path, text, jobId): Promise ### Worker.writeText(path, text, jobId): Promise
@ -225,7 +206,7 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i
- `params` an object with key and value of the parameters - `params` an object with key and value of the parameters
- `jobId` Please see details above - `jobId` Please see details above
**Supported Parameters:** **Useful Parameters:**
| name | type | default value | description | | name | type | default value | description |
| --------------------------- | ------ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------- | | --------------------------- | ------ | ----------------- | ------------------------------------------------------------------------------------------------------------------------------- |
@ -234,11 +215,8 @@ Worker.setParameters() set parameters for Tesseract API (using SetVariable()), i
| tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited | | tessedit\_char\_whitelist | string | '' | setting white list characters makes the result only contains these characters, useful the content in image is limited |
| preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words | | preserve\_interword\_spaces | string | '0' | '0' or '1', keeps the space between words |
| user\_defined\_dpi | string | '' | Define custom dpi, use to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** | | user\_defined\_dpi | string | '' | Define custom dpi, use to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** |
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | This list is incomplete. As Tesseract.js passes parameters to the Tesseract engine, all parameters supported by the underlying version of Tesseract should also be supported by Tesseract.js. (Note that parameters marked as “init only” in Tesseract documentation cannot be set by `setParameters` or `recognize`.)
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
**Examples:** **Examples:**
@ -262,8 +240,9 @@ Figures out what words are in `image`, where the words are in `image`, etc.
**Arguments:** **Arguments:**
- `image` see [Image Format](./image-format.md) for more details. - `image` see [Image Format](./image-format.md) for more details.
- `options` a object of customized options - `options` an object of customized options
- `rectangle` an object to specify the regions you want to recognized in the image, should contain top, left, width and height, see example below. - `rectangle` an object to specify the regions you want to recognized in the image, should contain top, left, width and height, see example below.
- `output` an object specifying which output formats to return (by default `text`, `blocks`, `hocr`, and `tsv` are returned)
- `jobId` Please see details above - `jobId` Please see details above
**Output:** **Output:**
@ -273,8 +252,7 @@ Figures out what words are in `image`, where the words are in `image`, etc.
```javascript ```javascript
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
(async () => { (async () => {
const worker = createWorker(); const worker = await createWorker();
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image); const { data: { text } } = await worker.recognize(image);
@ -287,8 +265,7 @@ With rectangle
```javascript ```javascript
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
(async () => { (async () => {
const worker = createWorker(); const worker = await createWorker();
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image, { const { data: { text } } = await worker.recognize(image, {
@ -313,8 +290,7 @@ Worker.detect() does OSD (Orientation and Script Detection) to the image instead
```javascript ```javascript
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
(async () => { (async () => {
const worker = createWorker(); const worker = await createWorker();
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data } = await worker.detect(image); const { data } = await worker.detect(image);
@ -361,7 +337,7 @@ Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is
```javascript ```javascript
const { createWorker, createScheduler } = Tesseract; const { createWorker, createScheduler } = Tesseract;
const scheduler = createScheduler(); const scheduler = createScheduler();
const worker = createWorker(); const worker = await createWorker();
scheduler.addWorker(worker); scheduler.addWorker(worker);
``` ```

@ -7,10 +7,9 @@ You can also check [examples](../examples) folder.
```javascript ```javascript
const { createWorker } = require('tesseract.js'); const { createWorker } = require('tesseract.js');
const worker = createWorker(); const worker = await createWorker();
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
@ -24,12 +23,11 @@ const worker = createWorker();
```javascript ```javascript
const { createWorker } = require('tesseract.js'); const { createWorker } = require('tesseract.js');
const worker = createWorker({ const worker = await createWorker({
logger: m => console.log(m), // Add logger here logger: m => console.log(m), // Add logger here
}); });
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
@ -43,10 +41,9 @@ const worker = createWorker({
```javascript ```javascript
const { createWorker } = require('tesseract.js'); const { createWorker } = require('tesseract.js');
const worker = createWorker(); const worker = await createWorker();
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng+chi_tra'); await worker.loadLanguage('eng+chi_tra');
await worker.initialize('eng+chi_tra'); await worker.initialize('eng+chi_tra');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
@ -54,15 +51,14 @@ const worker = createWorker();
await worker.terminate(); await worker.terminate();
})(); })();
``` ```
### with whitelist char (^2.0.0-beta.1) ### with whitelist char
```javascript ```javascript
const { createWorker } = require('tesseract.js'); const { createWorker } = require('tesseract.js');
const worker = createWorker(); const worker = await createWorker();
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
await worker.setParameters({ await worker.setParameters({
@ -74,17 +70,16 @@ const worker = createWorker();
})(); })();
``` ```
### with different pageseg mode (^2.0.0-beta.1) ### with different pageseg mode
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163 Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163
```javascript ```javascript
const { createWorker, PSM } = require('tesseract.js'); const { createWorker, PSM } = require('tesseract.js');
const worker = createWorker(); const worker = await createWorker();
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
await worker.setParameters({ await worker.setParameters({
@ -96,7 +91,7 @@ const worker = createWorker();
})(); })();
``` ```
### with pdf output (^2.0.0-beta.1) ### with pdf output
Please check **examples** folder for details. Please check **examples** folder for details.
@ -110,11 +105,10 @@ Node: [download-pdf.js](../examples/node/download-pdf.js)
```javascript ```javascript
const { createWorker } = require('tesseract.js'); const { createWorker } = require('tesseract.js');
const worker = createWorker(); const worker = await createWorker();
const rectangle = { left: 0, top: 0, width: 500, height: 250 }; const rectangle = { left: 0, top: 0, width: 500, height: 250 };
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle }); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle });
@ -128,7 +122,7 @@ const rectangle = { left: 0, top: 0, width: 500, height: 250 };
```javascript ```javascript
const { createWorker } = require('tesseract.js'); const { createWorker } = require('tesseract.js');
const worker = createWorker(); const worker = await createWorker();
const rectangles = [ const rectangles = [
{ {
left: 0, left: 0,
@ -145,7 +139,6 @@ const rectangles = [
]; ];
(async () => { (async () => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const values = []; const values = [];
@ -164,8 +157,8 @@ const rectangles = [
const { createWorker, createScheduler } = require('tesseract.js'); const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler(); const scheduler = createScheduler();
const worker1 = createWorker(); const worker1 = await createWorker();
const worker2 = createWorker(); const worker2 = await createWorker();
const rectangles = [ const rectangles = [
{ {
left: 0, left: 0,
@ -182,8 +175,6 @@ const rectangles = [
]; ];
(async () => { (async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng'); await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng'); await worker2.loadLanguage('eng');
await worker1.initialize('eng'); await worker1.initialize('eng');
@ -198,18 +189,16 @@ const rectangles = [
})(); })();
``` ```
### with multiple workers to speed up (^2.0.0-beta.1) ### with multiple workers to speed up
```javascript ```javascript
const { createWorker, createScheduler } = require('tesseract.js'); const { createWorker, createScheduler } = require('tesseract.js');
const scheduler = createScheduler(); const scheduler = createScheduler();
const worker1 = createWorker(); const worker1 = await createWorker();
const worker2 = createWorker(); const worker2 = await createWorker();
(async () => { (async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng'); await worker1.loadLanguage('eng');
await worker2.loadLanguage('eng'); await worker2.loadLanguage('eng');
await worker1.initialize('eng'); await worker1.initialize('eng');

@ -1,6 +1,13 @@
FAQ FAQ
=== ===
# Project
## What is the scope of this project?
Tesseract.js is the JavaScript/WebAssembly port of the Tesseract OCR engine. We do not edit the underlying Tesseract recognition engine in any way. Therefore, if you encounter bugs caused by the Tesseract engine you may open an issue here for the purposes of raising awareness to other users, but fixing is outside the scope of this repository.
If you encounter a Tesseract bug you would like to see fixed you should confirm the behavior is the same in the [main (CLI) version](https://github.com/tesseract-ocr/tesseract) of Tesseract and then open a Git Issue in that repository.
# Trained Data
## How does tesseract.js download and keep \*.traineddata? ## How does tesseract.js download and keep \*.traineddata?
The language model is downloaded by `worker.loadLanguage()` and you need to pass the langs to `worker.initialize()`. The language model is downloaded by `worker.loadLanguage()` and you need to pass the langs to `worker.initialize()`.
@ -9,34 +16,5 @@ During the downloading of language model, Tesseract.js will first check if \*.tr
## How can I train my own \*.traineddata? ## How can I train my own \*.traineddata?
For tesseract.js v2, check [TrainingTesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00) See the documentation from the main [Tesseract project](https://tesseract-ocr.github.io/tessdoc/) for training instructions.
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05)
## How can I get HOCR, TSV, Box, UNLV, OSD?
Starting from 2.0.0-beta.1, you can get all these information in the final result.
```javascript
import { createWorker } from 'tesseract.js';
const worker = createWorker({
logger: m => console.log(m)
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
tessedit_create_box: '1',
tessedit_create_unlv: '1',
tessedit_create_osd: '1',
});
const { data: { text, hocr, tsv, box, unlv } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
console.log(hocr);
console.log(tsv);
console.log(box);
console.log(unlv);
})();
```

@ -19,7 +19,7 @@ Tesseract.recognize(image, langs, {
Or Or
```javascript ```javascript
const worker = createWorker({ const worker = await createWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js', workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0', langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js', corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js',
@ -33,6 +33,6 @@ A string specifying the location of the [worker.js](./dist/worker.min.js) file.
A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`. A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`.
### corePath ### corePath
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available). A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js'.
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment. Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment.

@ -1,37 +0,0 @@
<!DOCTYPE HTML>
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<input type="file" id="uploader">
<script>
const recognize = function(evt){
const files = evt.target.files;
const worker = Tesseract.createWorker({
/*
* As Edge don't support webassembly,
* here we force to use asm.js version.
*/
corePath: '../../node_modules/tesseract.js-core/tesseract-core.asm.js',
logger: function(m){console.log(m);},
/*
* As there is no indexedDB in earlier version
* of Edge, here we disable cache.
*/
cacheMethod: 'none',
});
Promise.resolve()
.then(() => worker.load())
.then(() => worker.loadLanguage('eng'))
.then(() => worker.initialize('eng'))
.then(() => worker.recognize(files[0]))
.then((ret) => {
console.log(ret.data.text);
});
}
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);
</script>
</body>
</html>

@ -0,0 +1,34 @@
<!DOCTYPE HTML>
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
</head>
<body>
<input type="file" id="uploader">
<script type="module">
// This is a basic example more efficient than "basic.html".
// In this example we create a worker once, and this worker is re-used
// every time the user uploads a new file.
const worker = await Tesseract.createWorker({
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
workerPath: "/dist/worker.dev.js",
logger: function(m){console.log(m);}
});
await worker.loadLanguage('eng');
await worker.initialize('eng');
// Change handler for the file input: runs OCR on the first selected file
// using the shared `worker` and prints the recognized text to the console.
const recognize = async function(evt){
  const selectedFile = evt.target.files[0];
  const { data: { text } } = await worker.recognize(selectedFile);
  console.log(text);
}
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);
</script>
</body>
</html>

@ -5,9 +5,16 @@
<body> <body>
<input type="file" id="uploader"> <input type="file" id="uploader">
<script> <script>
// This is the most basic example (contains a single function call).
// However, in cases when multiple recognition jobs are run,
// calling Tesseract.recognize() each time is inefficient.
// See "basic-efficient.html" for a more efficient example.
const recognize = async ({ target: { files } }) => { const recognize = async ({ target: { files } }) => {
const { data: { text } } = await Tesseract.recognize(files[0], 'eng', { const { data: { text } } = await Tesseract.recognize(files[0], 'eng', {
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js', corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m), logger: m => console.log(m),
}); });
console.log(text); console.log(text);

@ -6,10 +6,14 @@
<textarea id="message">Working...</textarea> <textarea id="message">Working...</textarea>
<script> <script>
// This example provides a standardized performance benchmark.
// It does not accept user input.
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
const worker = createWorker();
(async () => { (async () => {
await worker.load(); const worker = await createWorker();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');

@ -39,9 +39,7 @@ function progressUpdate(packet){
async function recognizeFile(file) { async function recognizeFile(file) {
document.querySelector("#log").innerHTML = '' document.querySelector("#log").innerHTML = ''
const corePath = window.navigator.userAgent.indexOf("Edge") > -1 const corePath = '../../node_modules/tesseract.js-core/tesseract-core.wasm.js';
? '../../node_modules/tesseract.js-core/tesseract-core.asm.js'
: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js';
const lang = document.querySelector('#langsel').value const lang = document.querySelector('#langsel').value
const data = await Tesseract.recognize(file, lang, { const data = await Tesseract.recognize(file, lang, {

@ -8,27 +8,29 @@
<button id="download-pdf" disabled="true">Download PDF</button> <button id="download-pdf" disabled="true">Download PDF</button>
</div> </div>
<textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea> <textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea>
<script> <script type="module">
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
const worker = createWorker({ const worker = await createWorker({
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js', corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m), logger: m => console.log(m),
}); });
const uploader = document.getElementById('uploader'); const uploader = document.getElementById('uploader');
const dlBtn = document.getElementById('download-pdf'); const dlBtn = document.getElementById('download-pdf');
let pdf;
const recognize = async ({ target: { files } }) => { const recognize = async ({ target: { files } }) => {
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize(files[0]); const res = await worker.recognize(files[0],{pdfTitle: "Example PDF"},{pdf: true});
pdf = res.data.pdf;
const text = res.data.text;
const board = document.getElementById('board'); const board = document.getElementById('board');
board.value = text; board.value = text;
dlBtn.disabled = false; dlBtn.disabled = false;
}; };
const downloadPDF = async () => { const downloadPDF = async () => {
const filename = 'tesseract-ocr-result.pdf'; const filename = 'tesseract-ocr-result.pdf';
const { data } = await worker.getPDF('Tesseract OCR Result'); const blob = new Blob([new Uint8Array(pdf)], { type: 'application/pdf' });
const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' });
if (navigator.msSaveBlob) { if (navigator.msSaveBlob) {
// IE 10+ // IE 10+
navigator.msSaveBlob(blob, filename); navigator.msSaveBlob(blob, filename);

@ -0,0 +1,59 @@
<html>
<head>
<script src="/dist/tesseract.dev.js"></script>
<style>
.column {
float: left;
width: 20%;
padding: 5px;
}
</style>
</head>
<body>
<input type="file" id="uploader">
<div class="row">
<div class="column">
<p>Input Image</p>
<img id="imgInput" style="max-width:500px;">
</div>
<div class="column">
<p>Rotated, Original Color</p>
<img id="imgOriginal" style="max-width:500px;">
</div>
<div class="column">
<p>Rotated, Grey</p>
<img id="imgGrey" style="max-width:500px;">
</div>
<div class="column">
<p>Rotated, Binary</p>
<img id="imgBinary" style="max-width:500px;">
</div>
</div>
<script>
// Change handler for the file input: previews the selected image, runs
// recognition on it with auto-rotation enabled, and displays the color,
// grey, and binary images returned in the result.
// NOTE(review): a new worker is created on every change event and is not
// terminated inside this function — confirm whether that is intended.
const recognize = async ({ target: { files } }) => {
// Show the unprocessed upload in the "Input Image" column.
document.getElementById("imgInput").src = URL.createObjectURL(files[0]);
const worker = await Tesseract.createWorker({
// corePath: '/tesseract-core-simd.wasm.js',
workerPath: "/dist/worker.dev.js"
});
await worker.loadLanguage('eng');
await worker.initialize('eng');
// NOTE(review): this second, argument-less initialize() follows
// initialize('eng') and looks redundant — confirm before removing.
await worker.initialize();
// Second argument: recognition options (rotateAuto). Third argument: which
// outputs to include in `ret.data` (the three processed images).
const ret = await worker.recognize(files[0], {rotateAuto: true}, {imageColor: true, imageGrey: true, imageBinary: true});
// The returned image values are assigned directly as <img> sources.
document.getElementById("imgOriginal").src = ret.data.imageColor;
document.getElementById("imgGrey").src = ret.data.imageGrey;
document.getElementById("imgBinary").src = ret.data.imageBinary;
}
const elm = document.getElementById('uploader');
elm.addEventListener('change', recognize);
</script>
</body>
</html>

@ -2,10 +2,8 @@
const path = require('path'); const path = require('path');
const { createWorker } = require('../../'); const { createWorker } = require('../../');
const worker = createWorker();
(async () => { (async () => {
await worker.load(); const worker = await createWorker();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"]; const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"];

@ -9,14 +9,12 @@ const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/c
console.log(`Recognizing ${image}`); console.log(`Recognizing ${image}`);
(async () => { (async () => {
const worker = createWorker(); const worker = await createWorker();
await worker.load();
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image); const { data: { text, pdf } } = await worker.recognize(image, {pdfTitle: "Example PDF"}, {pdf: true});
console.log(text); console.log(text);
const { data } = await worker.getPDF('Tesseract OCR Result'); fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(pdf));
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(data));
console.log('Generate PDF: tesseract-ocr-result.pdf'); console.log('Generate PDF: tesseract-ocr-result.pdf');
await worker.terminate(); await worker.terminate();
})(); })();

@ -6,12 +6,11 @@ const [,, imagePath] = process.argv;
const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png')); const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png'));
console.log(`Recognizing ${image}`); console.log(`Recognizing ${image}`);
const worker = createWorker({
logger: m => console.log(m),
});
(async () => { (async () => {
await worker.load(); const worker = await createWorker({
logger: m => console.log(m),
});
await worker.loadLanguage('eng'); await worker.loadLanguage('eng');
await worker.initialize('eng'); await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image); const { data: { text } } = await worker.recognize(image);

@ -0,0 +1,26 @@
const { createWorker, createScheduler } = require('../../');
const scheduler = createScheduler();
// Creates a worker, prepares it for English, and adds it to the scheduler.
// `createWorker` is asynchronous and resolves to a worker that is already
// loaded, so it must be awaited and no separate `load()` call is needed.
const workerGen = async () => {
  const worker = await createWorker({cachePath: "."});
  await worker.loadLanguage('eng');
  await worker.initialize('eng');
  scheduler.addWorker(worker);
};
// Number of workers created and added to the scheduler.
const workerN = 4;

// Starts the workers in parallel, queues the recognition jobs, and always
// shuts the scheduler down afterwards.
(async () => {
  try {
    // Wait until every worker has been created and registered with the scheduler
    await Promise.all(Array.from({ length: workerN }, () => workerGen()));
    /** Add 10 recognition jobs; the scheduler distributes them across the workers */
    await Promise.all(Array.from({ length: 10 }, () => (
      scheduler.addJob('recognize', 'https://tesseract.projectnaptha.com/img/eng_bw.png').then((x) => console.log(x.data.text))
    )));
  } finally {
    // Terminate even when a worker or job fails, so the workers are shut down and the script can exit.
    await scheduler.terminate(); // It also terminates all workers.
  }
})();

2957
package-lock.json generated

File diff suppressed because it is too large Load Diff

@ -47,8 +47,8 @@
"eslint-plugin-import": "^2.22.1", "eslint-plugin-import": "^2.22.1",
"expect.js": "^0.3.1", "expect.js": "^0.3.1",
"express": "^4.17.1", "express": "^4.17.1",
"mocha": "^8.1.3", "mocha": "^10.0.0",
"mocha-headless-chrome": "^2.0.3", "mocha-headless-chrome": "^4.0.0",
"npm-run-all": "^4.1.5", "npm-run-all": "^4.1.5",
"nyc": "^15.1.0", "nyc": "^15.1.0",
"rimraf": "^2.7.1", "rimraf": "^2.7.1",
@ -70,7 +70,7 @@
"opencollective-postinstall": "^2.0.2", "opencollective-postinstall": "^2.0.2",
"regenerator-runtime": "^0.13.3", "regenerator-runtime": "^0.13.3",
"resolve-url": "^0.2.1", "resolve-url": "^0.2.1",
"tesseract.js-core": "^3.0.2", "tesseract.js-core": "^4.0.0",
"wasm-feature-detect": "^1.2.11", "wasm-feature-detect": "^1.2.11",
"zlibjs": "^0.3.1" "zlibjs": "^0.3.1"
}, },

@ -1,8 +1,7 @@
const createWorker = require('./createWorker'); const createWorker = require('./createWorker');
const recognize = async (image, langs, options) => { const recognize = async (image, langs, options) => {
const worker = createWorker(options); const worker = await createWorker(options);
await worker.load();
await worker.loadLanguage(langs); await worker.loadLanguage(langs);
await worker.initialize(langs); await worker.initialize(langs);
return worker.recognize(image) return worker.recognize(image)
@ -12,8 +11,7 @@ const recognize = async (image, langs, options) => {
}; };
const detect = async (image, options) => { const detect = async (image, options) => {
const worker = createWorker(options); const worker = await createWorker(options);
await worker.load();
await worker.loadLanguage('osd'); await worker.loadLanguage('osd');
await worker.initialize('osd'); await worker.initialize('osd');
return worker.detect(image) return worker.detect(image)

@ -0,0 +1,5 @@
module.exports = {
COLOR: 0,
GREY: 1,
BINARY: 2,
};

@ -15,7 +15,7 @@ const {
let workerCounter = 0; let workerCounter = 0;
module.exports = (_options = {}) => { module.exports = async (_options = {}) => {
const id = getId('Worker', workerCounter); const id = getId('Worker', workerCounter);
const { const {
logger, logger,
@ -27,7 +27,17 @@ module.exports = (_options = {}) => {
}); });
const resolves = {}; const resolves = {};
const rejects = {}; const rejects = {};
let workerResReject;
let workerResResolve;
const workerRes = new Promise((resolve, reject) => {
workerResResolve = resolve;
workerResReject = reject;
});
const workerError = (event) => { workerResReject(event.message); };
let worker = spawnWorker(options); let worker = spawnWorker(options);
worker.onerror = workerError;
workerCounter += 1; workerCounter += 1;
@ -53,7 +63,11 @@ module.exports = (_options = {}) => {
}) })
); );
const load = (jobId) => ( const load = () => (
console.warn('`load` is depreciated and should be removed from code (workers now come pre-loaded)')
);
const loadInternal = (jobId) => (
startJob(createJob({ startJob(createJob({
id: jobId, action: 'load', payload: { options }, id: jobId, action: 'load', payload: { options },
})) }))
@ -99,11 +113,11 @@ module.exports = (_options = {}) => {
})) }))
); );
const initialize = (langs = 'eng', oem = defaultOEM, jobId) => ( const initialize = (langs = 'eng', oem = defaultOEM, config, jobId) => (
startJob(createJob({ startJob(createJob({
id: jobId, id: jobId,
action: 'initialize', action: 'initialize',
payload: { langs, oem }, payload: { langs, oem, config },
})) }))
); );
@ -115,21 +129,24 @@ module.exports = (_options = {}) => {
})) }))
); );
const recognize = async (image, opts = {}, jobId) => ( const recognize = async (image, opts = {}, output = {
blocks: true, text: true, hocr: true, tsv: true,
}, jobId) => (
startJob(createJob({ startJob(createJob({
id: jobId, id: jobId,
action: 'recognize', action: 'recognize',
payload: { image: await loadImage(image), options: opts }, payload: { image: await loadImage(image), options: opts, output },
})) }))
); );
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => ( const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => {
startJob(createJob({ console.log('`getPDF` function is depreciated. `recognize` option `savePDF` should be used instead.');
return startJob(createJob({
id: jobId, id: jobId,
action: 'getPDF', action: 'getPDF',
payload: { title, textonly }, payload: { title, textonly },
})) }));
); };
const detect = async (image, jobId) => ( const detect = async (image, jobId) => (
startJob(createJob({ startJob(createJob({
@ -167,6 +184,7 @@ module.exports = (_options = {}) => {
resolves[action]({ jobId, data: d }); resolves[action]({ jobId, data: d });
} else if (status === 'reject') { } else if (status === 'reject') {
rejects[action](data); rejects[action](data);
if (action === 'load') workerResReject(data);
if (errorHandler) { if (errorHandler) {
errorHandler(data); errorHandler(data);
} else { } else {
@ -177,7 +195,7 @@ module.exports = (_options = {}) => {
} }
}); });
return { const resolveObj = {
id, id,
worker, worker,
setResolve, setResolve,
@ -195,4 +213,8 @@ module.exports = (_options = {}) => {
detect, detect,
terminate, terminate,
}; };
loadInternal().then(() => workerResResolve(resolveObj)).catch(() => {});
return workerRes;
}; };

55
src/index.d.ts vendored

@ -1,6 +1,6 @@
declare namespace Tesseract { declare namespace Tesseract {
function createScheduler(): Scheduler function createScheduler(): Scheduler
function createWorker(options?: Partial<WorkerOptions>): Worker function createWorker(options?: Partial<WorkerOptions>): Promise<Worker>
function setLogging(logging: boolean): void function setLogging(logging: boolean): void
function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult> function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>
function detect(image: ImageLike, options?: Partial<WorkerOptions>): any function detect(image: ImageLike, options?: Partial<WorkerOptions>): any
@ -20,9 +20,10 @@ declare namespace Tesseract {
removeText(path: string, jobId?: string): Promise<ConfigResult> removeText(path: string, jobId?: string): Promise<ConfigResult>
FS(method: string, args: any[], jobId?: string): Promise<ConfigResult> FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult> loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult>
initialize(langs?: string | Lang[], oem?: OEM, jobId?: string): Promise<ConfigResult> initialize(langs?: string | Lang[], oem?: OEM, config?: string | Partial<InitOptions>, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult> setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult> getImage(type: imageType): string
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, output?: Partial<OutputFormats>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult> detect(image: ImageLike, jobId?: string): Promise<DetectResult>
terminate(jobId?: string): Promise<ConfigResult> terminate(jobId?: string): Promise<ConfigResult>
getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult> getPDF(title?: string, textonly?: boolean, jobId?: string):Promise<GetPDFResult>
@ -33,6 +34,14 @@ declare namespace Tesseract {
data: unknown; data: unknown;
} }
/**
 * Settings that can be passed as the `config` argument of `Worker.initialize`
 * in object form (`Partial<InitOptions>`) instead of as config text.
 * Every value is typed as a string.
 */
interface InitOptions {
load_system_dawg: string
load_freq_dawg: string
load_unambig_dawg: string
load_punc_dawg: string
load_number_dawg: string
load_bigram_dawg: string
}
interface WorkerOptions { interface WorkerOptions {
corePath: string corePath: string
langPath: string langPath: string
@ -57,8 +66,26 @@ declare namespace Tesseract {
tessjs_create_unlv: string tessjs_create_unlv: string
tessjs_create_osd: string tessjs_create_osd: string
} }
/**
 * Boolean switches, one per output format, accepted by `Worker.recognize`
 * through its `output` argument (as `Partial<OutputFormats>`).
 */
interface OutputFormats {
text: boolean;
blocks: boolean;
hocr: boolean;
tsv: boolean;
box: boolean;
unlv: boolean;
osd: boolean;
pdf: boolean;
imageColor: boolean;
imageGrey: boolean;
imageBinary: boolean;
debug: boolean;
}
interface RecognizeOptions { interface RecognizeOptions {
rectangle: Rectangle rectangle: Rectangle
pdfTitle: string
pdfTextOnly: boolean
rotateAuto: boolean
rotateRadians: number
} }
interface ConfigResult { interface ConfigResult {
jobId: string jobId: string
@ -77,11 +104,11 @@ declare namespace Tesseract {
data: DetectData data: DetectData
} }
interface DetectData { interface DetectData {
tesseract_script_id: number tesseract_script_id: number | null
script: string script: string | null
script_confidence: number script_confidence: number | null
orientation_degrees: number orientation_degrees: number | null
orientation_confidence: number orientation_confidence: number | null
} }
interface Rectangle { interface Rectangle {
left: number left: number
@ -111,6 +138,11 @@ declare namespace Tesseract {
SPARSE_TEXT_OSD = '12', SPARSE_TEXT_OSD = '12',
RAW_LINE = '13' RAW_LINE = '13'
} }
/** Image variants that can be requested through `Worker.getImage(type)`. */
const enum imageType {
COLOR = 0,
GREY = 1,
BINARY = 2
}
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
| CanvasRenderingContext2D | File | Blob | ImageData | Buffer; | CanvasRenderingContext2D | File | Blob | ImageData | Buffer;
interface Block { interface Block {
@ -208,7 +240,7 @@ declare namespace Tesseract {
page: Page; page: Page;
} }
interface Page { interface Page {
blocks: Block[]; blocks: Block[] | null;
confidence: number; confidence: number;
lines: Line[]; lines: Line[];
oem: string; oem: string;
@ -224,6 +256,11 @@ declare namespace Tesseract {
box: string | null; box: string | null;
unlv: string | null; unlv: string | null;
sd: string | null; sd: string | null;
imageColor: string | null;
imageGrey: string | null;
imageBinary: string | null;
rotateRadians: number | null;
pdf: number[] | null;
} }
} }

@ -22,31 +22,33 @@ module.exports = (page) => {
const words = []; const words = [];
const symbols = []; const symbols = [];
page.blocks.forEach((block) => { if (page.blocks) {
block.paragraphs.forEach((paragraph) => { page.blocks.forEach((block) => {
paragraph.lines.forEach((line) => { block.paragraphs.forEach((paragraph) => {
line.words.forEach((word) => { paragraph.lines.forEach((line) => {
word.symbols.forEach((sym) => { line.words.forEach((word) => {
symbols.push({ word.symbols.forEach((sym) => {
...sym, page, block, paragraph, line, word, symbols.push({
...sym, page, block, paragraph, line, word,
});
});
words.push({
...word, page, block, paragraph, line,
}); });
}); });
words.push({ lines.push({
...word, page, block, paragraph, line, ...line, page, block, paragraph,
}); });
}); });
lines.push({ paragraphs.push({
...line, page, block, paragraph, ...paragraph, page, block,
}); });
}); });
paragraphs.push({ blocks.push({
...paragraph, page, block, ...block, page,
}); });
}); });
blocks.push({ }
...block, page,
});
});
return { return {
...page, blocks, paragraphs, lines, words, symbols, ...page, blocks, paragraphs, lines, words, symbols,

@ -0,0 +1,17 @@
/*
* default output formats for tesseract.js
*/
module.exports = {
text: true,
blocks: true,
hocr: true,
tsv: true,
box: false,
unlv: false,
osd: false,
pdf: false,
imageColor: false,
imageGrey: false,
imageBinary: false,
};

@ -14,7 +14,9 @@ const dump = require('./utils/dump');
const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker'; const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker';
const setImage = require('./utils/setImage'); const setImage = require('./utils/setImage');
const defaultParams = require('./constants/defaultParams'); const defaultParams = require('./constants/defaultParams');
const defaultOutput = require('./constants/defaultOutput');
const { log, setLogging } = require('../utils/log'); const { log, setLogging } = require('../utils/log');
const PSM = require('../constants/PSM');
/* /*
* Tesseract Module returned by TesseractCore. * Tesseract Module returned by TesseractCore.
@ -54,7 +56,7 @@ const load = async ({ workerId, jobId, payload: { options: { corePath, logging }
} }
}; };
const FS = ({ workerId, payload: { method, args } }, res) => { const FS = async ({ workerId, payload: { method, args } }, res) => {
log(`[${workerId}]: FS.${method} with args ${args}`); log(`[${workerId}]: FS.${method} with args ${args}`);
res.resolve(TessModule.FS[method](...args)); res.resolve(TessModule.FS[method](...args));
}; };
@ -79,6 +81,7 @@ res) => {
? () => Promise.resolve() ? () => Promise.resolve()
: adapter.readCache; : adapter.readCache;
let data = null; let data = null;
let newData = false;
try { try {
const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`); const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`);
@ -90,6 +93,7 @@ res) => {
throw Error('Not found in cache'); throw Error('Not found in cache');
} }
} catch (e) { } catch (e) {
newData = true;
log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`); log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`);
if (typeof _lang === 'string') { if (typeof _lang === 'string') {
let path = null; let path = null;
@ -131,8 +135,13 @@ res) => {
TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data); TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data);
} }
if (['write', 'refresh', undefined].includes(cacheMethod)) { if (newData && ['write', 'refresh', undefined].includes(cacheMethod)) {
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data); try {
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data);
} catch (err) {
log(`[${workerId}]: Failed to write ${lang}.traineddata to cache due to error:`);
log(err.toString());
}
} }
return Promise.resolve(data); return Promise.resolve(data);
@ -148,7 +157,7 @@ res) => {
} }
}; };
const setParameters = ({ payload: { params: _params } }, res) => { const setParameters = async ({ payload: { params: _params } }, res) => {
Object.keys(_params) Object.keys(_params)
.filter((k) => !k.startsWith('tessjs_')) .filter((k) => !k.startsWith('tessjs_'))
.forEach((key) => { .forEach((key) => {
@ -161,9 +170,9 @@ const setParameters = ({ payload: { params: _params } }, res) => {
} }
}; };
const initialize = ({ const initialize = async ({
workerId, workerId,
payload: { langs: _langs, oem }, payload: { langs: _langs, oem, config },
}, res) => { }, res) => {
const langs = (typeof _langs === 'string') const langs = (typeof _langs === 'string')
? _langs ? _langs
@ -176,13 +185,27 @@ const initialize = ({
if (api !== null) { if (api !== null) {
api.End(); api.End();
} }
let configFile;
let configStr;
// config argument may either be config file text, or object with key/value pairs
// In the latter case we convert to config file text here
if (typeof config === 'object') {
configStr = JSON.stringify(config).replace(/,/g, '\n').replace(/:/g, ' ').replace(/["'{}]/g, '');
} else {
configStr = config;
}
if (typeof configStr === 'string') {
configFile = '/config';
TessModule.FS.writeFile(configFile, configStr);
}
api = new TessModule.TessBaseAPI(); api = new TessModule.TessBaseAPI();
const status = api.Init(null, langs, oem); const status = api.Init(null, langs, oem);
if (status === -1) { if (status === -1) {
res.reject('initialization failed'); res.reject('initialization failed');
} }
params = defaultParams; params = defaultParams;
setParameters({ payload: { params } }); await setParameters({ payload: { params } });
res.progress({ res.progress({
workerId, status: 'initialized api', progress: 1, workerId, status: 'initialized api', progress: 1,
}); });
@ -192,46 +215,165 @@ const initialize = ({
} }
}; };
const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => { const getPDFInternal = (title, textonly) => {
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
pdfRenderer.BeginDocument(title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
return TessModule.FS.readFile('/tesseract-ocr.pdf');
};
// Job handler for the `getPDF` action: builds the PDF through `getPDFInternal`
// using the title and text-only flag from the payload, and resolves the job with the result.
const getPDF = async ({ payload }, res) => {
  const { title, textonly } = payload;
  const pdf = getPDFInternal(title, textonly);
  res.resolve(pdf);
};
// Builds the effective output settings for a recognition job by layering, in order:
// the default output formats, the legacy `tessjs_create_*` parameters, and the
// caller-supplied `output` object. Returns the merged settings together with the
// number of enabled outputs that require recognition to be run.
const processOutput = (output) => {
  // Work on a copy so the shared defaults are never modified
  const workingOutput = JSON.parse(JSON.stringify(defaultOutput));

  // Earlier versions selected output formats through `setParameters`;
  // those flags are still honoured for compatibility
  const legacyFlags = [
    ['tessjs_create_box', 'box'],
    ['tessjs_create_hocr', 'hocr'],
    ['tessjs_create_osd', 'osd'],
    ['tessjs_create_tsv', 'tsv'],
    ['tessjs_create_unlv', 'unlv'],
  ];
  legacyFlags.forEach(([paramName, format]) => {
    if (params[paramName] === '1') workingOutput[format] = true;
  });

  // Settings passed for this call take precedence over everything above
  Object.keys(output).forEach((format) => {
    workingOutput[format] = output[format];
  });

  // The image outputs are the only ones not counted as requiring recognition
  const nonRecOutputs = ['imageColor', 'imageGrey', 'imageBinary'];
  const recOutputCount = Object.keys(workingOutput)
    .filter((format) => workingOutput[format] && !nonRecOutputs.includes(format))
    .length;

  return { workingOutput, recOutputCount };
};
// List of options for Tesseract.js (rather than passed through to Tesseract),
// not including those with prefix "tessjs_"
const tessjsOptions = ['rectangle', 'pdfTitle', 'pdfTextOnly', 'rotateAuto', 'rotateRadians'];
const recognize = async ({
payload: {
image, options, output,
},
}, res) => {
try { try {
const ptr = setImage(TessModule, api, image); const optionsTess = {};
if (typeof options === 'object' && Object.keys(options).length > 0) {
// The options provided by users contain a mix of options for Tesseract.js
// and parameters passed through to Tesseract.
for (const param of Object.keys(options)) {
if (!param.startsWith('tessjs_') && !tessjsOptions.includes(param)) {
optionsTess[param] = options[param];
}
}
}
if (output.debug) {
optionsTess.debug_file = '/debugInternal.txt';
TessModule.FS.writeFile('/debugInternal.txt', '');
}
// If any parameters are changed here they are changed back at the end
if (Object.keys(optionsTess).length > 0) {
api.SaveParameters();
for (const prop of Object.keys(optionsTess)) {
api.SetVariable(prop, optionsTess[prop]);
}
}
const { workingOutput, recOutputCount } = processOutput(output);
// When the auto-rotate option is True, setImage is called with no angle,
// then the angle is calculated by Tesseract and then setImage is re-called.
// Otherwise, setImage is called once using the user-provided rotateRadiansFinal value.
let rotateRadiansFinal;
if (options.rotateAuto) {
// The angle is only detected if auto page segmentation is used
// Therefore, if this is not the mode specified by the user, it is enabled temporarily here
const psmInit = api.GetPageSegMode();
let psmEdit = false;
if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) {
psmEdit = true;
api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO));
}
setImage(TessModule, api, image);
api.FindLines();
const rotateRadiansCalc = api.GetAngle();
// Restore user-provided PSM setting
if (psmEdit) {
api.SetVariable('tessedit_pageseg_mode', String(psmInit));
}
// Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime
if (Math.abs(rotateRadiansCalc) >= 0.005) {
rotateRadiansFinal = rotateRadiansCalc;
setImage(TessModule, api, image, rotateRadiansFinal);
} else {
// Image needs to be reset if run with different PSM setting earlier
if (psmEdit) {
setImage(TessModule, api, image);
}
rotateRadiansFinal = 0;
}
} else {
rotateRadiansFinal = options.rotateRadians || 0;
setImage(TessModule, api, image, rotateRadiansFinal);
}
const rec = options.rectangle;
if (typeof rec === 'object') { if (typeof rec === 'object') {
api.SetRectangle(rec.left, rec.top, rec.width, rec.height); api.SetRectangle(rec.left, rec.top, rec.width, rec.height);
} }
api.Recognize(null);
res.resolve(dump(TessModule, api, params)); if (recOutputCount > 0) {
TessModule._free(ptr); api.Recognize(null);
} else {
log('Skipping recognition: all output options requiring recognition are disabled.');
}
const { pdfTitle } = options;
const { pdfTextOnly } = options;
const result = dump(TessModule, api, workingOutput, { pdfTitle, pdfTextOnly });
result.rotateRadians = rotateRadiansFinal;
if (output.debug) TessModule.FS.unlink('/debugInternal.txt');
if (Object.keys(optionsTess).length > 0) {
api.RestoreParameters();
}
res.resolve(result);
} catch (err) { } catch (err) {
res.reject(err.toString()); res.reject(err.toString());
} }
}; };
const getPDF = ({ payload: { title, textonly } }, res) => { const detect = async ({ payload: { image } }, res) => {
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
pdfRenderer.BeginDocument(title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf'));
};
const detect = ({ payload: { image } }, res) => {
try { try {
const ptr = setImage(TessModule, api, image); setImage(TessModule, api, image);
const results = new TessModule.OSResults(); const results = new TessModule.OSResults();
if (!api.DetectOS(results)) { if (!api.DetectOS(results)) {
api.End(); res.resolve({
TessModule._free(ptr); tesseract_script_id: null,
res.reject('Failed to detect OS'); script: null,
script_confidence: null,
orientation_degrees: null,
orientation_confidence: null,
});
} else { } else {
const best = results.best_result; const best = results.best_result;
const oid = best.orientation_id; const oid = best.orientation_id;
const sid = best.script_id; const sid = best.script_id;
TessModule._free(ptr);
res.resolve({ res.resolve({
tesseract_script_id: sid, tesseract_script_id: sid,
script: results.unicharset.get_script_from_script_id(sid), script: results.unicharset.get_script_from_script_id(sid),
@ -245,7 +387,7 @@ const detect = ({ payload: { image } }, res) => {
} }
}; };
const terminate = (_, res) => { const terminate = async (_, res) => {
try { try {
if (api !== null) { if (api !== null) {
api.End(); api.End();
@ -282,22 +424,18 @@ exports.dispatchHandlers = (packet, send) => {
latestJob = res; latestJob = res;
try { ({
({ load,
load, FS,
FS, loadLanguage,
loadLanguage, initialize,
initialize, setParameters,
setParameters, recognize,
recognize, getPDF,
getPDF, detect,
detect, terminate,
terminate, })[packet.action](packet, res)
})[packet.action](packet, res); .catch((err) => res.reject(err.toString()));
} catch (err) {
/** Prepare exception to travel through postMessage */
res.reject(err.toString());
}
}; };
/** /**

@ -0,0 +1,56 @@
// Copied from https://gist.github.com/jonleighton/958841
// Copyright 2011 Jon Leighton, MIT LICENSE
/* eslint no-bitwise: 0 */
module.exports = (arrayBuffer) => {
let base64 = '';
const encodings = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
const bytes = new Uint8Array(arrayBuffer);
const { byteLength } = bytes;
const byteRemainder = byteLength % 3;
const mainLength = byteLength - byteRemainder;
let a; let b; let c; let
d;
let chunk;
// Main loop deals with bytes in chunks of 3
for (let i = 0; i < mainLength; i += 3) {
// Combine the three bytes into a single integer
chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2];
// Use bitmasks to extract 6-bit segments from the triplet
a = (chunk & 16515072) >> 18; // 16515072 = (2^6 - 1) << 18
b = (chunk & 258048) >> 12; // 258048 = (2^6 - 1) << 12
c = (chunk & 4032) >> 6; // 4032 = (2^6 - 1) << 6
d = chunk & 63; // 63 = 2^6 - 1
// Convert the raw binary segments to the appropriate ASCII encoding
base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d];
}
// Deal with the remaining bytes and padding
if (byteRemainder === 1) {
chunk = bytes[mainLength];
a = (chunk & 252) >> 2; // 252 = (2^6 - 1) << 2
// Set the 4 least significant bits to zero
b = (chunk & 3) << 4; // 3 = 2^2 - 1
base64 += `${encodings[a] + encodings[b]}==`;
} else if (byteRemainder === 2) {
chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1];
a = (chunk & 64512) >> 10; // 64512 = (2^6 - 1) << 10
b = (chunk & 1008) >> 4; // 1008 = (2^6 - 1) << 4
// Set the 2 least significant bits to zero
c = (chunk & 15) << 2; // 15 = 2^4 - 1
base64 += `${encodings[a] + encodings[b] + encodings[c]}=`;
}
return base64;
};

@ -7,6 +7,8 @@
* @author Guillermo Webster <gui@mit.edu> * @author Guillermo Webster <gui@mit.edu>
* @author Jerome Wu <jeromewus@gmail.com> * @author Jerome Wu <jeromewus@gmail.com>
*/ */
const arrayBufferToBase64 = require('./arrayBufferToBase64');
const imageType = require('../../constants/imageType');
/** /**
* deindent * deindent
@ -37,13 +39,7 @@ const deindent = (html) => {
* @function dump recognition result to a JSON object * @function dump recognition result to a JSON object
* @access public * @access public
*/ */
module.exports = (TessModule, api, { module.exports = (TessModule, api, output, options) => {
tessjs_create_hocr,
tessjs_create_tsv,
tessjs_create_box,
tessjs_create_unlv,
tessjs_create_osd,
}) => {
const ri = api.GetIterator(); const ri = api.GetIterator();
const { const {
RIL_BLOCK, RIL_BLOCK,
@ -65,137 +61,162 @@ module.exports = (TessModule, api, {
.map((e) => e.slice(prefix.length + 1))[0] .map((e) => e.slice(prefix.length + 1))[0]
); );
ri.Begin(); const getImage = (type) => {
do { api.WriteImage(type, '/image.png');
if (ri.IsAtBeginningOf(RIL_BLOCK)) { const pngBuffer = TessModule.FS.readFile('/image.png');
const poly = ri.BlockPolygon(); const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`;
let polygon = null; TessModule.FS.unlink('/image.png');
// BlockPolygon() returns null when automatic page segmentation is off return pngStr;
if (TessModule.getPointer(poly) > 0) { };
const n = poly.get_n();
const px = poly.get_x(); const getPDFInternal = (title, textonly) => {
const py = poly.get_y(); const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
polygon = []; pdfRenderer.BeginDocument(title);
for (let i = 0; i < n; i += 1) { pdfRenderer.AddImage(api);
polygon.push([px.getValue(i), py.getValue(i)]); pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
return TessModule.FS.readFile('/tesseract-ocr.pdf');
};
if (output.blocks) {
ri.Begin();
do {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (TessModule.getPointer(poly) > 0) {
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
polygon = [];
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
}
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
} }
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
}
block = { block = {
paragraphs: [], paragraphs: [],
text: ri.GetUTF8Text(RIL_BLOCK), text: ri.GetUTF8Text(RIL_BLOCK),
confidence: ri.Confidence(RIL_BLOCK), confidence: ri.Confidence(RIL_BLOCK),
baseline: ri.getBaseline(RIL_BLOCK), baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK), bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'), blocktype: enumToString(ri.BlockType(), 'PT'),
polygon, polygon,
}; };
blocks.push(block); blocks.push(block);
} }
if (ri.IsAtBeginningOf(RIL_PARA)) { if (ri.IsAtBeginningOf(RIL_PARA)) {
para = { para = {
lines: [], lines: [],
text: ri.GetUTF8Text(RIL_PARA), text: ri.GetUTF8Text(RIL_PARA),
confidence: ri.Confidence(RIL_PARA), confidence: ri.Confidence(RIL_PARA),
baseline: ri.getBaseline(RIL_PARA), baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA), bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(), is_ltr: !!ri.ParagraphIsLtr(),
}; };
block.paragraphs.push(para); block.paragraphs.push(para);
} }
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
textline = { textline = {
words: [], words: [],
text: ri.GetUTF8Text(RIL_TEXTLINE), text: ri.GetUTF8Text(RIL_TEXTLINE),
confidence: ri.Confidence(RIL_TEXTLINE), confidence: ri.Confidence(RIL_TEXTLINE),
baseline: ri.getBaseline(RIL_TEXTLINE), baseline: ri.getBaseline(RIL_TEXTLINE),
bbox: ri.getBoundingBox(RIL_TEXTLINE), bbox: ri.getBoundingBox(RIL_TEXTLINE),
}; };
para.lines.push(textline); para.lines.push(textline);
} }
if (ri.IsAtBeginningOf(RIL_WORD)) { if (ri.IsAtBeginningOf(RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes(); const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection(); const wordDir = ri.WordDirection();
word = { word = {
symbols: [], symbols: [],
choices: [], choices: [],
text: ri.GetUTF8Text(RIL_WORD), text: ri.GetUTF8Text(RIL_WORD),
confidence: ri.Confidence(RIL_WORD), confidence: ri.Confidence(RIL_WORD),
baseline: ri.getBaseline(RIL_WORD), baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD), bbox: ri.getBoundingBox(RIL_WORD),
is_numeric: !!ri.WordIsNumeric(), is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(), in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'), direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(), language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold, is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic, is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined, is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace, is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif, is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps, is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize, font_size: fontInfo.pointsize,
font_id: fontInfo.font_id, font_id: fontInfo.font_id,
font_name: fontInfo.font_name, font_name: fontInfo.font_name,
}; };
const wc = new TessModule.WordChoiceIterator(ri); const wc = new TessModule.WordChoiceIterator(ri);
do { do {
word.choices.push({ word.choices.push({
text: wc.GetUTF8Text(), text: wc.GetUTF8Text(),
confidence: wc.Confidence(), confidence: wc.Confidence(),
}); });
} while (wc.Next()); } while (wc.Next());
TessModule.destroy(wc); TessModule.destroy(wc);
textline.words.push(word); textline.words.push(word);
} }
// let image = null; // let image = null;
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL) // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
// var image = pix2array(pix); // var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics // // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix)); // TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(RIL_SYMBOL)) { if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
symbol = { symbol = {
choices: [], choices: [],
image: null, image: null,
text: ri.GetUTF8Text(RIL_SYMBOL), text: ri.GetUTF8Text(RIL_SYMBOL),
confidence: ri.Confidence(RIL_SYMBOL), confidence: ri.Confidence(RIL_SYMBOL),
baseline: ri.getBaseline(RIL_SYMBOL), baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL), bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(), is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(), is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(), is_dropcap: !!ri.SymbolIsDropcap(),
}; };
word.symbols.push(symbol); word.symbols.push(symbol);
const ci = new TessModule.ChoiceIterator(ri); const ci = new TessModule.ChoiceIterator(ri);
do { do {
symbol.choices.push({ symbol.choices.push({
text: ci.GetUTF8Text(), text: ci.GetUTF8Text(),
confidence: ci.Confidence(), confidence: ci.Confidence(),
}); });
} while (ci.Next()); } while (ci.Next());
// TessModule.destroy(i); // TessModule.destroy(i);
} }
} while (ri.Next(RIL_SYMBOL)); } while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri); TessModule.destroy(ri);
}
return { return {
text: api.GetUTF8Text(), text: output.text ? api.GetUTF8Text() : null,
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, hocr: output.hocr ? deindent(api.GetHOCRText()) : null,
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null, tsv: output.tsv ? api.GetTSVText() : null,
box: tessjs_create_box === '1' ? api.GetBoxText() : null, box: output.box ? api.GetBoxText() : null,
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null, unlv: output.unlv ? api.GetUNLVText() : null,
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null, osd: output.osd ? api.GetOsdText() : null,
pdf: output.pdf ? getPDFInternal(options.pdfTitle ?? 'Tesseract OCR Result', options.pdfTextOnly ?? false) : null,
imageColor: output.imageColor ? getImage(imageType.COLOR) : null,
imageGrey: output.imageGrey ? getImage(imageType.GREY) : null,
imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null,
confidence: api.MeanTextConf(), confidence: api.MeanTextConf(),
blocks, blocks: output.blocks ? blocks : null,
psm: enumToString(api.GetPageSegMode(), 'PSM'), psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'), oem: enumToString(api.oem(), 'OEM'),
version: api.Version(), version: api.Version(),
debug: output.debug ? TessModule.FS.readFile('/debugInternal.txt', { encoding: 'utf8', flags: 'a+' }) : null,
}; };
}; };

@ -8,56 +8,24 @@ const fileType = require('file-type');
* @function set image in tesseract for recognition * @function set image in tesseract for recognition
* @access public * @access public
*/ */
// NOTE(review): this span is a side-by-side diff rendering. Where a line holds two
// statements, the first is the pre-change text and the second is the post-change text;
// lines with a single statement exist on only one side of the diff.
// Post-change behavior: the image bytes are written to TessModule.FS at '/input' (BMP
// input is first decoded and re-encoded with bmp-js), then api.SetImageFile(exif, angle)
// is called. `angle` defaults to 0 and there is no return statement on this side.
// Pre-change behavior: the bytes were copied into memory obtained from TessModule._malloc,
// passed to api.SetImage, and the resulting pix/data pointer was returned.
module.exports = (TessModule, api, image) => { module.exports = (TessModule, api, image, angle = 0) => {
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length })); const type = fileType(image);
const type = fileType(buf);
let bytesPerPixel = 0;
let data = null;
let pix = null;
let w = 0;
let h = 0;
// exif: char code of the byte captured right after the \x01\x12\x00\x03\x00\x00\x00\x01\x00
// sequence within the first 500 bytes; falls back to 1 when there is no match (or the
// captured code is 0). Presumably this is the EXIF orientation tag — TODO confirm.
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1; const exif = image.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
/* // /*
* Leptonica supports uncompressed but not compressed bmp files // * Leptonica supports some but not all bmp files
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516 // * @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
* We therefore use bmp-js to process all bmp files // * We therefore use bmp-js to convert all bmp files into a format Leptonica is known to support
*/ // */
if (type && type.mime === 'image/bmp') { if (type && type.mime === 'image/bmp') {
// Not sure what this line actually does, but removing breaks the function
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length }));
const bmpBuf = bmp.decode(buf); const bmpBuf = bmp.decode(buf);
data = TessModule._malloc(bmpBuf.data.length * Uint8Array.BYTES_PER_ELEMENT); TessModule.FS.writeFile('/input', bmp.encode(bmpBuf).data);
TessModule.HEAPU8.set(bmpBuf.data, data);
w = bmpBuf.width;
h = bmpBuf.height;
bytesPerPixel = 4;
} else { } else {
const ptr = TessModule._malloc(buf.length * Uint8Array.BYTES_PER_ELEMENT); TessModule.FS.writeFile('/input', image);
TessModule.HEAPU8.set(buf, ptr);
pix = TessModule._pixReadMem(ptr, buf.length);
if (TessModule.getValue(pix + (7 * 4), 'i32') === 0) {
/*
* Set a yres default value to prevent warning from tesseract
* See kMinCredibleResolution in tesseract/src/ccstruct/publictypes.h
*/
TessModule.setValue(pix + (7 * 4), 300, 'i32');
}
[w, h] = Array(2).fill(0)
.map((v, idx) => (
TessModule.getValue(pix + (idx * 4), 'i32')
));
} }
/* api.SetImageFile(exif, angle);
* As some image format (ex. bmp) is not supported natively by tesseract,
* sometimes it will not return pix directly, but data and bytesPerPixel
* for another SetImage usage.
*
*/
if (data === null) {
api.SetImage(pix, undefined, undefined, undefined, undefined, exif);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif);
}
return data === null ? pix : data;
}; };

@ -1,9 +1,9 @@
// NOTE(review): side-by-side diff rendering — the first statement on each line is the
// pre-change text, the second is the post-change text.
// Post-change: `worker` is a module-level `let` that the async `before` hook assigns from
// `await createWorker(OPTIONS)`. Pre-change: the worker was created synchronously at module
// level and the hook returned `worker.load()`.
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
const FS_WAIT = 500; const FS_WAIT = 500;
const worker = createWorker(OPTIONS); let worker;
before(function cb() { before(async function cb() {
this.timeout(0); this.timeout(0);
return worker.load(); worker = await createWorker(OPTIONS);
}); });
describe('FS', async () => { describe('FS', async () => {

@ -1,8 +1,8 @@
// NOTE(review): side-by-side diff rendering — the first statement on each line is the
// pre-change text, the second is the post-change text.
// Post-change: the async `before` hook awaits `createWorker(OPTIONS)` and stores the result
// in the module-level `worker`. Pre-change: the worker was created synchronously and the
// hook returned `worker.load()`.
const { createWorker } = Tesseract; const { createWorker } = Tesseract;
const worker = createWorker(OPTIONS); let worker;
before(function cb() { before(async function cb() {
this.timeout(0); this.timeout(0);
return worker.load(); worker = await createWorker(OPTIONS);
}); });
describe('detect()', async () => { describe('detect()', async () => {
@ -17,3 +17,15 @@ describe('detect()', async () => {
}); });
}).timeout(TIMEOUT); }).timeout(TIMEOUT);
}); });
// OSD detection through the simplified top-level `Tesseract.detect` interface
// (no explicitly managed worker).
//
// The `it` callback returns the combined promise of every case so mocha waits for
// `Tesseract.detect` to settle and a failed expectation actually fails the test.
// An `async` callback passed to `forEach` would be fired and forgotten, letting the
// test finish (and pass) before any assertion had run.
// The `describe` callback is synchronous: it only registers tests and awaits nothing.
describe('detect()', () => {
  it('should detect OSD (simplified interface)', () => Promise.all(
    [
      { name: 'cosmic.png', ans: { script: 'Latin' } },
    ].map(async ({ name, ans: { script } }) => {
      const { data: { script: s } } = await Tesseract.detect(`${IMAGE_PATH}/${name}`, undefined, OPTIONS);
      expect(s).to.be(script);
    }),
  )).timeout(TIMEOUT);
});

@ -1,8 +1,8 @@
// NOTE(review): side-by-side diff rendering — the first statement on each line is the
// pre-change text, the second is the post-change text.
// Post-change: the hook awaits `createWorker(OPTIONS)` into the module-level `worker`
// (replacing the pre-change `await worker.load()` on an eagerly created worker), then loads
// the 'eng+chi_tra+osd' language data.
const { createWorker, PSM } = Tesseract; const { createWorker, PSM } = Tesseract;
const worker = createWorker(OPTIONS); let worker;
before(async function cb() { before(async function cb() {
this.timeout(0); this.timeout(0);
await worker.load(); worker = await createWorker(OPTIONS);
await worker.loadLanguage('eng+chi_tra+osd'); await worker.loadLanguage('eng+chi_tra+osd');
}); });
@ -30,6 +30,19 @@ describe('recognize()', () => {
)); ));
}); });
// Exercises the simplified top-level `Tesseract.recognize` interface (no explicitly
// managed worker) with the base64 fixtures; one mocha case is registered per format.
describe('should recognize base64 image (simplified interface)', () => {
  const fixtures = [
    { format: 'png', image: SIMPLE_PNG_BASE64, ans: SIMPLE_TEXT },
    { format: 'jpg', image: SIMPLE_JPG_BASE64, ans: SIMPLE_TEXT },
  ];

  fixtures.forEach((fixture) => {
    const title = `recongize ${fixture.format} in base64`;
    const testCase = it(title, async () => {
      const result = await Tesseract.recognize(fixture.image, undefined, OPTIONS);
      expect(result.data.text).to.be(fixture.ans);
    });
    testCase.timeout(TIMEOUT);
  });
});
describe('should recognize different langs', () => { describe('should recognize different langs', () => {
[ [
{ name: 'chinese.png', lang: 'chi_tra', ans: CHINESE_TEXT }, { name: 'chinese.png', lang: 'chi_tra', ans: CHINESE_TEXT },

@ -7,8 +7,7 @@ before(async function cb() {
const NUM_WORKERS = 5; const NUM_WORKERS = 5;
console.log(`Initializing ${NUM_WORKERS} workers`); console.log(`Initializing ${NUM_WORKERS} workers`);
workers = await Promise.all(Array(NUM_WORKERS).fill(0).map(async () => { workers = await Promise.all(Array(NUM_WORKERS).fill(0).map(async () => {
const w = createWorker(OPTIONS); const w = await createWorker(OPTIONS);
await w.load();
await w.loadLanguage('eng'); await w.loadLanguage('eng');
await w.initialize('eng'); await w.initialize('eng');
return w; return w;

Loading…
Cancel
Save