Added SIMD-detection when corePath is manually specified per #735 (#745)

master
Balearica 1 year ago committed by GitHub
parent a31deaabe1
commit dec363bda5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 10
      docs/local-installation.md
  2. 2
      examples/browser/basic-efficient.html
  3. 2
      examples/browser/basic.html
  4. 2
      examples/browser/demo.html
  5. 2
      examples/browser/download-pdf.html
  6. 20
      src/worker-script/browser/getCore.js

@ -12,7 +12,7 @@ In Node.js environment, the only path you may want to customize is languages/lan
Tesseract.recognize(image, langs, {
workerPath: 'https://cdn.jsdelivr.net/npm/tesseract.js@v4.0.3/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3/tesseract-core.wasm.js',
corePath: 'https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3',
})
```
@ -22,7 +22,7 @@ Or
const worker = await createWorker({
workerPath: 'https://cdn.jsdelivr.net/npm/tesseract.js@v4.0.3/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',
corePath: 'https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3/tesseract-core.wasm.js',
corePath: 'https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3',
});
```
@ -33,4 +33,8 @@ A string specifying the location of the `worker.js` file.
A string specifying the location of the tesseract language files, with default value 'https://tessdata.projectnaptha.com/4.0.0'. Language file URLs are calculated according to the formula `langPath + langCode + '.traineddata.gz'`.
### corePath
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3/tesseract-core.wasm.js'.
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3'.
When `corePath` is a directory rather than a specific `.js` file (e.g. `https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3`), Tesseract.js loads either `tesseract-core-simd.wasm.js` or `tesseract-core.wasm.js` depending on whether the user's device supports SIMD (see [https://webassembly.org/roadmap/](https://webassembly.org/roadmap/)). Therefore, if self-hosting, it is important that both of these files are in the location you specify for `corePath`. Having multiple files is necessary because the SIMD-enabled version is *significantly* faster (for the LSTM model [the default]) but is not yet supported on all devices.
When `corePath` is set to a specific `.js` file (e.g. `https://cdn.jsdelivr.net/npm/tesseract.js-core@v4.0.3/tesseract-core.wasm.js`), Tesseract.js will load that file regardless of whether the user's device supports SIMD. This behavior exists to preserve backwards compatibility; specifying a directory that contains both files is strongly recommended. Specifying a single file will result in either much slower performance (if `tesseract-core.wasm.js` is specified) or failure to run on certain devices (if `tesseract-core-simd.wasm.js` is specified).

@ -12,7 +12,7 @@
// every time the user uploads a new file.
const worker = await Tesseract.createWorker({
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
corePath: '../../node_modules/tesseract.js-core',
workerPath: "/dist/worker.dev.js",
logger: function(m){console.log(m);}
});

@ -13,7 +13,7 @@
const recognize = async ({ target: { files } }) => {
const { data: { text } } = await Tesseract.recognize(files[0], 'eng', {
corePath: '../../node_modules/tesseract.js-core/tesseract-core.wasm.js',
corePath: '../../node_modules/tesseract.js-core',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m),
});

@ -39,7 +39,7 @@ function progressUpdate(packet){
async function recognizeFile(file) {
document.querySelector("#log").innerHTML = ''
const corePath = '../../node_modules/tesseract.js-core/tesseract-core.wasm.js';
const corePath = '../../node_modules/tesseract.js-core';
const lang = document.querySelector('#langsel').value
const data = await Tesseract.recognize(file, lang, {

@ -11,7 +11,7 @@
<script type="module">
const { createWorker } = Tesseract;
const worker = await createWorker({
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js',
corePath: '/node_modules/tesseract.js-core',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m),
});

@ -6,19 +6,27 @@ module.exports = async (corePath, res) => {
res.progress({ status: 'loading tesseract core', progress: 0 });
// If the user specifies a core path, we use that
// Otherwise, we detect the correct core based on SIMD support
let corePathImport = corePath;
if (!corePathImport) {
// Otherwise, default to CDN
const corePathImport = corePath || `https://cdn.jsdelivr.net/npm/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}`;
// If a user specifies a specific JavaScript file, load that file.
// Otherwise, assume a directory has been provided, and load either
// tesseract-core.wasm.js or tesseract-core-simd.wasm.js depending
// on whether this device has SIMD support.
let corePathImportFile;
if (corePathImport.slice(-2) === 'js') {
corePathImportFile = corePathImport;
} else {
const simdSupport = await simd();
if (simdSupport) {
corePathImport = `https://cdn.jsdelivr.net/npm/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core-simd.wasm.js`;
corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-simd.wasm.js`;
} else {
corePathImport = `https://cdn.jsdelivr.net/npm/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.wasm.js`;
corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core.wasm.js`;
}
}
// Create a module named `global.TesseractCore`
global.importScripts(corePathImport);
global.importScripts(corePathImportFile);
// Tesseract.js-core versions through 4.0.3 create a module named `global.TesseractCoreWASM`,
// so we account for that here to preserve backwards compatibility.

Loading…
Cancel
Save