From a470b836d5d27e0e426beea0afdcde0e14f1af5a Mon Sep 17 00:00:00 2001 From: Jerome Wu Date: Wed, 2 Oct 2019 16:53:25 +0800 Subject: [PATCH] Update tests --- package.json | 3 +- scripts/server.js | 2 +- scripts/test-helper.js | 5 + src/createJob.js | 18 +- src/createScheduler.js | 37 +++- src/createWorker.js | 74 +++++-- src/utils/getId.js | 3 + src/worker-script/index.js | 13 +- src/worker/browser/defaultOptions.js | 4 +- src/worker/browser/defaultOptions.js~ | 18 ++ src/worker/browser/loadImage.js | 4 +- tests/constants.js | 33 +++ tests/detect.test.html | 3 +- tests/detect.test.js | 46 ++-- tests/recognize.test.html | 3 +- tests/recognize.test.js | 296 +++++++++----------------- tests/scheduler.test.html | 18 ++ tests/scheduler.test.js | 35 +++ 18 files changed, 341 insertions(+), 274 deletions(-) create mode 100644 src/utils/getId.js create mode 100644 src/worker/browser/defaultOptions.js~ create mode 100644 tests/constants.js create mode 100644 tests/scheduler.test.html create mode 100644 tests/scheduler.test.js diff --git a/package.json b/package.json index 6d93d3c..2a7c124 100644 --- a/package.json +++ b/package.json @@ -10,13 +10,14 @@ "start": "node scripts/server.js", "build": "rimraf dist && webpack --config scripts/webpack.config.prod.js", "prepublishOnly": "npm run build", - "wait": "wait-on http://localhost:3000/package.json", + "wait": "rimraf dist && wait-on http://localhost:3000/dist/tesseract.dev.js", "test": "npm-run-all -p -r start test:all", "test:all": "npm-run-all wait test:browser:* test:node", "test:node": "nyc mocha --exit --bail --require ./scripts/test-helper.js ./tests/*.test.js", "test:browser-tpl": "mocha-headless-chrome -a incognito -a no-sandbox -a disable-setuid-sandbox -t 300000", "test:browser:detect": "npm run test:browser-tpl -- -f ./tests/detect.test.html", "test:browser:recognize": "npm run test:browser-tpl -- -f ./tests/recognize.test.html", + "test:browser:scheduler": "npm run test:browser-tpl -- -f ./tests/scheduler.test.html", "lint": "eslint src", "postinstall": "opencollective-postinstall || true" }, diff --git a/scripts/server.js b/scripts/server.js index a9c5b02..4ccc4b5 100644 --- a/scripts/server.js +++ b/scripts/server.js @@ -10,7 +10,7 @@ const app = express(); app.use(cors()); app.use('/', express.static(path.resolve(__dirname, '..'))); -app.use(middleware(compiler, { publicPath: '/dist' })); +app.use(middleware(compiler, { publicPath: '/dist', writeToDisk: true })); module.exports = app.listen(3000, () => { console.log('Server is running on port 3000'); diff --git a/scripts/test-helper.js b/scripts/test-helper.js index f164d7d..027b300 100644 --- a/scripts/test-helper.js +++ b/scripts/test-helper.js @@ -1,4 +1,9 @@ +const constants = require('../tests/constants'); global.expect = require('expect.js'); global.fs = require('fs'); global.path = require('path'); global.Tesseract = require('../src'); + +Object.keys(constants).forEach((key) => { + global[key] = constants[key]; +}); diff --git a/src/createJob.js b/src/createJob.js index 258c6dc..f90364d 100644 --- a/src/createJob.js +++ b/src/createJob.js @@ -1,11 +1,17 @@ -let jobCounter = 1; +const getId = require('./utils/getId'); -module.exports = ( +let jobCounter = 0; + +module.exports = ({ + id: _id, action, - payload, -) => { - const id = `Job-${jobCounter}-${Math.random().toString(16).slice(3, 8)}`; - jobCounter += 1; + payload = {}, +}) => { + let id = _id; + if (typeof id === 'undefined') { + id = getId('Job', jobCounter); + jobCounter += 1; + } return { id, diff --git a/src/createScheduler.js b/src/createScheduler.js index dcc1569..c2003f4 100644 --- a/src/createScheduler.js +++ b/src/createScheduler.js @@ -1,8 +1,20 @@ +const createJob = require('./createJob'); +const log = require('./utils/log'); +const getId = require('./utils/getId'); + +let schedulerCounter = 0; + module.exports = () => { + const id = getId('Scheduler', schedulerCounter); const workers = {}; const runningWorkers = {}; let jobQueue = []; + schedulerCounter += 1; + + const getQueueLen = () => jobQueue.length; + const getNumWorkers = () => Object.keys(workers).length; + const dequeue = () => { if (jobQueue.length !== 0) { const wIds = Object.keys(workers); @@ -17,11 +29,12 @@ module.exports = () => { const queue = (action, payload) => ( new Promise((resolve, reject) => { + const job = createJob({ action, payload }); jobQueue.push(async (w) => { jobQueue.shift(); - runningWorkers[w.id] = true; + runningWorkers[w.id] = job; try { - resolve(await w[action].apply(this, payload)); + resolve(await w[action].apply(this, [...payload, job.id])); } catch (err) { reject(err); } finally { @@ -29,22 +42,30 @@ module.exports = () => { dequeue(); } }); + log(`[${id}]: add ${job.id} to JobQueue`); + log(`[${id}]: JobQueue length=${jobQueue.length}`); dequeue(); }) ); const addWorker = (w) => { workers[w.id] = w; + log(`[${id}]: add ${w.id}`); + log(`[${id}]: number of workers=${getNumWorkers()}`); + dequeue(); return w.id; }; - const addJob = (action, ...payload) => ( - queue(action, payload) - ); + const addJob = async (action, ...payload) => { + if (getNumWorkers() === 0) { + throw Error(`[${id}]: You need to have at least one worker before adding jobs`); + } + return queue(action, payload); + }; const terminate = async () => { - Object.keys(workers).forEach(async (id) => { - await workers[id].terminate(); + Object.keys(workers).forEach(async (wid) => { + await workers[wid].terminate(); }); jobQueue = []; }; @@ -53,5 +74,7 @@ module.exports = () => { addWorker, addJob, terminate, + getQueueLen, + getNumWorkers, }; }; diff --git a/src/createWorker.js b/src/createWorker.js index 8f16a42..0689757 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -2,6 +2,7 @@ const resolvePaths = require('./utils/resolvePaths'); const circularize = require('./utils/circularize'); const createJob = require('./createJob'); const log = require('./utils/log'); +const getId = require('./utils/getId'); const { defaultOEM } = require('./constants/config'); const { defaultOptions, @@ -12,11 +13,10 @@ const { send, } = require('./worker/node'); -let workerCounter = 1; +let workerCounter = 0; module.exports = (_options = {}) => { - const id = `Worker-${workerCounter}-${Math.random().toString(16).slice(3, 8)}`; - workerCounter += 1; + const id = getId('Worker', workerCounter); const { logger, ...options @@ -28,6 +28,8 @@ module.exports = (_options = {}) => { const rejects = {}; let worker = spawnWorker(options); + workerCounter += 1; + const setResolve = (action, res) => { resolves[action] = res; }; @@ -36,10 +38,9 @@ module.exports = (_options = {}) => { rejects[action] = rej; }; - const startJob = (action, payload = {}) => ( + const startJob = ({ id: jobId, action, payload }) => ( new Promise((resolve, reject) => { - const { id: jobId } = createJob(action, payload); - log(`[${id}]: Start ${jobId}, action=${action}`); + log(`[${id}]: Start ${jobId}, action=${action}, payload=`, payload); setResolve(action, resolve); setReject(action, reject); send(worker, { @@ -51,32 +52,58 @@ module.exports = (_options = {}) => { }) ); - const load = () => ( - startJob('load', { options }) + const load = jobId => ( + startJob(createJob({ + id: jobId, action: 'load', payload: { options }, + })) ); - const loadLanguage = (langs = 'eng') => ( - startJob('loadLanguage', { langs, options }) + const loadLanguage = (langs = 'eng', jobId) => ( + startJob(createJob({ + id: jobId, + action: 'loadLanguage', + payload: { langs, options }, + })) ); - const initialize = (langs = 'eng', oem = defaultOEM) => ( - startJob('initialize', { langs, oem }) + const initialize = (langs = 'eng', oem = defaultOEM, jobId) => ( + startJob(createJob({ + id: jobId, + action: 'initialize', + payload: { langs, oem }, + })) ); - const setParameters = (params = {}) => ( - startJob('setParameters', { params }) + const setParameters = (params = {}, jobId) => ( + startJob(createJob({ + id: jobId, + action: 'setParameters', + payload: { params }, + })) ); - const recognize = async (image, opts = {}) => ( - startJob('recognize', { image: await loadImage(image), options: opts }) + const recognize = async (image, opts = {}, jobId) => ( + startJob(createJob({ + id: jobId, + action: 'recognize', + payload: { image: await loadImage(image), options: opts }, + })) ); - const getPDF = (title = 'Tesseract OCR Result', textonly = false) => ( - startJob('getPDF', { title, textonly }) + const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => ( + startJob(createJob({ + id: jobId, + action: 'getPDF', + payload: { title, textonly }, + })) ); - const detect = async image => ( - startJob('detect', { image: await loadImage(image) }) + const detect = async (image, jobId) => ( + startJob(createJob({ + id: jobId, + action: 'detect', + payload: { image: await loadImage(image) }, + })) ); const terminate = async () => { @@ -88,15 +115,18 @@ module.exports = (_options = {}) => { return Promise.resolve(); }; - onMessage(worker, ({ status, action, data }) => { + onMessage(worker, ({ + workerId, jobId, status, action, data, + }) => { if (status === 'resolve') { + log(`[${workerId}]: Complete ${jobId}, data=`, data); let d = data; if (action === 'recognize') { d = circularize(data); } else if (action === 'getPDF') { d = Array.from({ ...data, length: Object.keys(data).length }); } - resolves[action](d); + resolves[action]({ jobId, data: d }); } else if (status === 'reject') { rejects[action](data); throw Error(data); diff --git a/src/utils/getId.js b/src/utils/getId.js new file mode 100644 index 0000000..8c2b119 --- /dev/null +++ b/src/utils/getId.js @@ -0,0 +1,3 @@ +module.exports = (prefix, cnt) => ( + `${prefix}-${cnt}-${Math.random().toString(16).slice(3, 8)}` +); diff --git a/src/worker-script/index.js b/src/worker-script/index.js index b72a28b..621638b 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -15,6 +15,7 @@ const dump = require('./utils/dump'); const isBrowser = require('../utils/getEnvironment')('type') === 'browser'; const setImage = require('./utils/setImage'); const defaultParams = require('./constants/defaultParams'); +const log = require('../utils/log'); /* * Tesseract Module returned by TesseractCore. @@ -23,7 +24,7 @@ let TessModule; /* * TessearctBaseAPI instance */ -let api; +let api = null; let latestJob; let adapter = {}; let params = defaultParams; @@ -77,11 +78,13 @@ const loadLanguage = async ({ try { const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`); if (typeof _data !== 'undefined') { + log(`[${workerId}]: Load ${lang}.traineddata from cache`); data = _data; } else { throw Error('Not found in cache'); } } catch (e) { + log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`); if (typeof _lang === 'string') { let path = null; @@ -173,8 +176,12 @@ const initialize = ({ res.progress({ workerId, status: 'initializing api', progress: 0, }); + if (api !== null) { + api.End(); + } api = new TessModule.TessBaseAPI(); api.Init(null, langs, oem); + params = defaultParams; setParameters({ payload: { params } }); res.progress({ workerId, status: 'initialized api', progress: 1, @@ -242,7 +249,9 @@ const detect = ({ payload: { image } }, res) => { const terminate = (_, res) => { try { - api.End(); + if (api !== null) { + api.End(); + } res.resolve({ terminated: true }); } catch (err) { res.reject(err.toString()); diff --git a/src/worker/browser/defaultOptions.js b/src/worker/browser/defaultOptions.js index 64d0851..cef5e58 100644 --- a/src/worker/browser/defaultOptions.js +++ b/src/worker/browser/defaultOptions.js @@ -1,5 +1,5 @@ const resolveURL = require('resolve-url'); -const { version } = require('../../../package.json'); +const { version, dependencies } = require('../../../package.json'); const defaultOptions = require('../../constants/defaultOptions'); /* @@ -14,5 +14,5 @@ module.exports = { * If browser doesn't support WebAssembly, * load ASM version instead */ - corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`, + corePath: `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`, }; diff --git a/src/worker/browser/defaultOptions.js~ b/src/worker/browser/defaultOptions.js~ new file mode 100644 index 0000000..cef5e58 --- /dev/null +++ b/src/worker/browser/defaultOptions.js~ @@ -0,0 +1,18 @@ +const resolveURL = require('resolve-url'); +const { version, dependencies } = require('../../../package.json'); +const defaultOptions = require('../../constants/defaultOptions'); + +/* + * Default options for browser worker + */ +module.exports = { + ...defaultOptions, + workerPath: (typeof process !== 'undefined' && process.env.TESS_ENV === 'development') + ? resolveURL(`/dist/worker.dev.js?nocache=${Math.random().toString(36).slice(3)}`) + : `https://unpkg.com/tesseract.js@v${version}/dist/worker.min.js`, + /* + * If browser doesn't support WebAssembly, + * load ASM version instead + */ + corePath: `https://unpkg.com/tesseract.js-core@v${dependencies['tesseract.js-core'].substring(1)}/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`, +}; diff --git a/src/worker/browser/loadImage.js b/src/worker/browser/loadImage.js index a57b4f4..b849153 100644 --- a/src/worker/browser/loadImage.js +++ b/src/worker/browser/loadImage.js @@ -56,10 +56,10 @@ const loadImage = async (image) => { } } else if (image instanceof HTMLElement) { if (image.tagName === 'IMG') { - data = loadImage(image.src); + data = await loadImage(image.src); } if (image.tagName === 'VIDEO') { - data = loadImage(image.poster); + data = await loadImage(image.poster); } if (image.tagName === 'CANVAS') { await new Promise((resolve) => { diff --git a/tests/constants.js b/tests/constants.js new file mode 100644 index 0000000..e65f493 --- /dev/null +++ b/tests/constants.js @@ -0,0 +1,33 @@ +const TIMEOUT = 10000; +const IMAGE_PATH = 'http://localhost:3000/tests/assets/images'; +const IS_BROWSER = typeof window !== 'undefined' && typeof window.document !== 'undefined'; +const OPTIONS = { + langPath: 'http://localhost:3000/tests/assets/traineddata', + cachePath: './tests/assets/traineddata', + ...(IS_BROWSER ? { workerPath: '/dist/worker.dev.js' } : {}), +}; +const SIMPLE_TEXT = 'Tesseract.js\n'; +const SIMPLE_TEXT_HALF = 'Tesse\n'; +const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; +const TESTOCR_TEXT = 'This is a lot of 12 point text to test the\nocr code and see if it works on all types\nof file format.\n\nThe quick brown dog jumped over the\nlazy fox. The quick brown dog jumped\nover the lazy fox. The quick brown dog\njumped over the lazy fox. The quick\nbrown dog jumped over the lazy fox.\n'; +const CHINESE_TEXT = '繁 體 中 文 測 試\n'; +const FORMATS = ['png', 'jpg', 'bmp', 'pbm']; +const SIMPLE_PNG_BASE64 = ''; +const SIMPLE_JPG_BASE64 = ''; + +if (typeof module !== 'undefined') { + module.exports = { + TIMEOUT, + IMAGE_PATH, + IS_BROWSER, + SIMPLE_PNG_BASE64, + SIMPLE_JPG_BASE64, + CHINESE_TEXT, + SIMPLE_TEXT, + SIMPLE_TEXT_HALF, + COMSIC_TEXT, + TESTOCR_TEXT, + FORMATS, + OPTIONS, + }; +} diff --git a/tests/detect.test.html b/tests/detect.test.html index c119a6b..0206775 100644 --- a/tests/detect.test.html +++ b/tests/detect.test.html @@ -7,7 +7,8 @@
- + + - + + + + + + + + + + diff --git a/tests/scheduler.test.js b/tests/scheduler.test.js new file mode 100644 index 0000000..1bab4ca --- /dev/null +++ b/tests/scheduler.test.js @@ -0,0 +1,35 @@ +const { createScheduler, createWorker } = Tesseract; + +let workers = []; + +before(async function cb() { + this.timeout(0); + const NUM_WORKERS = 10; + console.log(`Initializing ${NUM_WORKERS} workers`); + workers = await Promise.all(Array(NUM_WORKERS).fill(0).map(async () => { + const w = createWorker(OPTIONS); + await w.load(); + await w.loadLanguage('eng'); + await w.initialize('eng'); + return w; + })); + console.log(`Initialized ${NUM_WORKERS} workers`); +}); + +describe('scheduler', () => { + describe('should speed up with more workers (running 20 jobs)', () => { + Array(10).fill(0).forEach((_, num) => ( + it(`support using ${num + 1} workers`, async () => { + const NUM_JOBS = 30; + const scheduler = createScheduler(); + workers.slice(0, num + 1).forEach((w) => { + scheduler.addWorker(w); + }); + const rets = await Promise.all(Array(NUM_JOBS).fill(0).map((_, idx) => ( + scheduler.addJob('recognize', `${IMAGE_PATH}/${idx % 2 === 0 ? 'simple' : 'cosmic'}.png`) + ))); + expect(rets.length).to.be(NUM_JOBS); + }).timeout(60000) + )); + }); +});