feat(ocr): Tesseract.js as default scanner, AI as opt-in per port
The mobile receipt scanner now runs Tesseract.js in-browser by default — on-device, free, and image bytes never leave the device. AI providers (OpenAI / Claude) become a per-port opt-in for higher accuracy on hard-to-read receipts. - Lazy-load Tesseract WASM in src/lib/ocr/tesseract-client.ts (the 5 MB bundle is dynamically imported on first scan, not shipped in the main chunk) - Heuristic parser src/lib/ocr/parse-receipt-text.ts extracts vendor, date, amount, currency, and line items from raw OCR text - New port-scoped aiEnabled flag on OcrConfig (defaults to false). The resolved flag never inherits from the global row — each port admin opts in independently - Scan endpoint short-circuits to manual-mode when aiEnabled=false so the AI provider is never invoked unless the admin has flipped the switch - Scan UI runs Tesseract first, then asks the server whether AI is enabled — uses the AI result only when its confidence is at least as high as Tesseract's; network failures degrade gracefully to the local parse - Admin OCR-settings form gains the per-port aiEnabled checkbox Tests: 756/756 vitest (was 747) — +7 parser unit tests, +2 aiEnabled config tests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -85,6 +85,7 @@
|
|||||||
"sonner": "^1.7.0",
|
"sonner": "^1.7.0",
|
||||||
"tailwind-merge": "^2.6.0",
|
"tailwind-merge": "^2.6.0",
|
||||||
"tailwindcss-animate": "^1.0.7",
|
"tailwindcss-animate": "^1.0.7",
|
||||||
|
"tesseract.js": "^7.0.0",
|
||||||
"zod": "^3.24.0",
|
"zod": "^3.24.0",
|
||||||
"zustand": "^5.0.0"
|
"zustand": "^5.0.0"
|
||||||
},
|
},
|
||||||
|
|||||||
92
pnpm-lock.yaml
generated
92
pnpm-lock.yaml
generated
@@ -200,6 +200,9 @@ importers:
|
|||||||
tailwindcss-animate:
|
tailwindcss-animate:
|
||||||
specifier: ^1.0.7
|
specifier: ^1.0.7
|
||||||
version: 1.0.7(tailwindcss@3.4.19(tsx@4.21.0)(yaml@2.8.2))
|
version: 1.0.7(tailwindcss@3.4.19(tsx@4.21.0)(yaml@2.8.2))
|
||||||
|
tesseract.js:
|
||||||
|
specifier: ^7.0.0
|
||||||
|
version: 7.0.0
|
||||||
zod:
|
zod:
|
||||||
specifier: ^3.24.0
|
specifier: ^3.24.0
|
||||||
version: 3.25.76
|
version: 3.25.76
|
||||||
@@ -2763,6 +2766,9 @@ packages:
|
|||||||
block-stream2@2.1.0:
|
block-stream2@2.1.0:
|
||||||
resolution: {integrity: sha512-suhjmLI57Ewpmq00qaygS8UgEq2ly2PCItenIyhMqVjo4t4pGzqMvfgJuX8iWTeSDdfSSqS6j38fL4ToNL7Pfg==}
|
resolution: {integrity: sha512-suhjmLI57Ewpmq00qaygS8UgEq2ly2PCItenIyhMqVjo4t4pGzqMvfgJuX8iWTeSDdfSSqS6j38fL4ToNL7Pfg==}
|
||||||
|
|
||||||
|
bmp-js@0.1.0:
|
||||||
|
resolution: {integrity: sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==}
|
||||||
|
|
||||||
brace-expansion@1.1.12:
|
brace-expansion@1.1.12:
|
||||||
resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==}
|
resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==}
|
||||||
|
|
||||||
@@ -3708,6 +3714,9 @@ packages:
|
|||||||
resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==}
|
resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==}
|
||||||
engines: {node: '>=0.10.0'}
|
engines: {node: '>=0.10.0'}
|
||||||
|
|
||||||
|
idb-keyval@6.2.2:
|
||||||
|
resolution: {integrity: sha512-yjD9nARJ/jb1g+CvD0tlhUHOrJ9Sy0P8T9MF3YaLlHnSRpwPfpTX0XIvpmw3gAJUmEu3FiICLBDPXVwyEvrleg==}
|
||||||
|
|
||||||
ieee754@1.2.1:
|
ieee754@1.2.1:
|
||||||
resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
|
resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
|
||||||
|
|
||||||
@@ -3877,6 +3886,9 @@ packages:
|
|||||||
resolution: {integrity: sha512-p3EcsicXjit7SaskXHs1hA91QxgTw46Fv6EFKKGS5DRFLD8yKnohjF3hxoju94b/OcMZoQukzpPpBE9uLVKzgQ==}
|
resolution: {integrity: sha512-p3EcsicXjit7SaskXHs1hA91QxgTw46Fv6EFKKGS5DRFLD8yKnohjF3hxoju94b/OcMZoQukzpPpBE9uLVKzgQ==}
|
||||||
engines: {node: '>= 0.4'}
|
engines: {node: '>= 0.4'}
|
||||||
|
|
||||||
|
is-url@1.2.4:
|
||||||
|
resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==}
|
||||||
|
|
||||||
is-weakmap@2.0.2:
|
is-weakmap@2.0.2:
|
||||||
resolution: {integrity: sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==}
|
resolution: {integrity: sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==}
|
||||||
engines: {node: '>= 0.4'}
|
engines: {node: '>= 0.4'}
|
||||||
@@ -4289,6 +4301,15 @@ packages:
|
|||||||
resolution: {integrity: sha512-pyFS63ptit/P5WqUkt+UUfe+4oevH+bFeIiPPdfb0pFeYEu/1ELnJu5l+5EcTKYL5M7zaAa7S8ddywgXypqKCw==}
|
resolution: {integrity: sha512-pyFS63ptit/P5WqUkt+UUfe+4oevH+bFeIiPPdfb0pFeYEu/1ELnJu5l+5EcTKYL5M7zaAa7S8ddywgXypqKCw==}
|
||||||
engines: {node: '>= 0.4'}
|
engines: {node: '>= 0.4'}
|
||||||
|
|
||||||
|
node-fetch@2.7.0:
|
||||||
|
resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==}
|
||||||
|
engines: {node: 4.x || >=6.0.0}
|
||||||
|
peerDependencies:
|
||||||
|
encoding: ^0.1.0
|
||||||
|
peerDependenciesMeta:
|
||||||
|
encoding:
|
||||||
|
optional: true
|
||||||
|
|
||||||
node-gyp-build-optional-packages@5.2.2:
|
node-gyp-build-optional-packages@5.2.2:
|
||||||
resolution: {integrity: sha512-s+w+rBWnpTMwSFbaE0UXsRlg7hU4FjekKU4eyAih5T8nJuNZT1nNsskXpxmeqSK9UzkBl6UgRlnKc8hz8IEqOw==}
|
resolution: {integrity: sha512-s+w+rBWnpTMwSFbaE0UXsRlg7hU4FjekKU4eyAih5T8nJuNZT1nNsskXpxmeqSK9UzkBl6UgRlnKc8hz8IEqOw==}
|
||||||
hasBin: true
|
hasBin: true
|
||||||
@@ -4384,6 +4405,10 @@ packages:
|
|||||||
zod:
|
zod:
|
||||||
optional: true
|
optional: true
|
||||||
|
|
||||||
|
opencollective-postinstall@2.0.3:
|
||||||
|
resolution: {integrity: sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
optionator@0.9.4:
|
optionator@0.9.4:
|
||||||
resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==}
|
resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==}
|
||||||
engines: {node: '>= 0.8.0'}
|
engines: {node: '>= 0.8.0'}
|
||||||
@@ -4970,6 +4995,9 @@ packages:
|
|||||||
regenerator-runtime@0.11.1:
|
regenerator-runtime@0.11.1:
|
||||||
resolution: {integrity: sha512-MguG95oij0fC3QV3URf4V2SDYGJhJnJGqvIIgdECeODCT98wSWDAJ94SSuVpYQUoTcGUIL6L4yNB7j1DFFHSBg==}
|
resolution: {integrity: sha512-MguG95oij0fC3QV3URf4V2SDYGJhJnJGqvIIgdECeODCT98wSWDAJ94SSuVpYQUoTcGUIL6L4yNB7j1DFFHSBg==}
|
||||||
|
|
||||||
|
regenerator-runtime@0.13.11:
|
||||||
|
resolution: {integrity: sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==}
|
||||||
|
|
||||||
regexp.prototype.flags@1.5.4:
|
regexp.prototype.flags@1.5.4:
|
||||||
resolution: {integrity: sha512-dYqgNSZbDwkaJ2ceRd9ojCGjBq+mOm9LmtXnAnEGyHhN/5R7iDW2TRw3h+o/jCFxus3P2LfWIIiwowAjANm7IA==}
|
resolution: {integrity: sha512-dYqgNSZbDwkaJ2ceRd9ojCGjBq+mOm9LmtXnAnEGyHhN/5R7iDW2TRw3h+o/jCFxus3P2LfWIIiwowAjANm7IA==}
|
||||||
engines: {node: '>= 0.4'}
|
engines: {node: '>= 0.4'}
|
||||||
@@ -5327,6 +5355,12 @@ packages:
|
|||||||
engines: {node: '>=14.0.0'}
|
engines: {node: '>=14.0.0'}
|
||||||
hasBin: true
|
hasBin: true
|
||||||
|
|
||||||
|
tesseract.js-core@7.0.0:
|
||||||
|
resolution: {integrity: sha512-WnNH518NzmbSq9zgTPeoF8c+xmilS8rFIl1YKbk/ptuuc7p6cLNELNuPAzcmsYw450ca6bLa8j3t0VAtq435Vw==}
|
||||||
|
|
||||||
|
tesseract.js@7.0.0:
|
||||||
|
resolution: {integrity: sha512-exPBkd+z+wM1BuMkx/Bjv43OeLBxhL5kKWsz/9JY+DXcXdiBjiAch0V49QR3oAJqCaL5qURE0vx9Eo+G5YE7mA==}
|
||||||
|
|
||||||
thenify-all@1.6.0:
|
thenify-all@1.6.0:
|
||||||
resolution: {integrity: sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==}
|
resolution: {integrity: sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==}
|
||||||
engines: {node: '>=0.8'}
|
engines: {node: '>=0.8'}
|
||||||
@@ -5383,6 +5417,9 @@ packages:
|
|||||||
toggle-selection@1.0.6:
|
toggle-selection@1.0.6:
|
||||||
resolution: {integrity: sha512-BiZS+C1OS8g/q2RRbJmy59xpyghNBqrr6k5L/uKBGRsTfxmu3ffiRnd8mlGPUVayg8pvfi5urfnu8TU7DVOkLQ==}
|
resolution: {integrity: sha512-BiZS+C1OS8g/q2RRbJmy59xpyghNBqrr6k5L/uKBGRsTfxmu3ffiRnd8mlGPUVayg8pvfi5urfnu8TU7DVOkLQ==}
|
||||||
|
|
||||||
|
tr46@0.0.3:
|
||||||
|
resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==}
|
||||||
|
|
||||||
tr46@5.1.1:
|
tr46@5.1.1:
|
||||||
resolution: {integrity: sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==}
|
resolution: {integrity: sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==}
|
||||||
engines: {node: '>=18'}
|
engines: {node: '>=18'}
|
||||||
@@ -5591,6 +5628,12 @@ packages:
|
|||||||
jsdom:
|
jsdom:
|
||||||
optional: true
|
optional: true
|
||||||
|
|
||||||
|
wasm-feature-detect@1.8.0:
|
||||||
|
resolution: {integrity: sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==}
|
||||||
|
|
||||||
|
webidl-conversions@3.0.1:
|
||||||
|
resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==}
|
||||||
|
|
||||||
webidl-conversions@7.0.0:
|
webidl-conversions@7.0.0:
|
||||||
resolution: {integrity: sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==}
|
resolution: {integrity: sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==}
|
||||||
engines: {node: '>=12'}
|
engines: {node: '>=12'}
|
||||||
@@ -5599,6 +5642,9 @@ packages:
|
|||||||
resolution: {integrity: sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==}
|
resolution: {integrity: sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==}
|
||||||
engines: {node: '>=18'}
|
engines: {node: '>=18'}
|
||||||
|
|
||||||
|
whatwg-url@5.0.0:
|
||||||
|
resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==}
|
||||||
|
|
||||||
which-boxed-primitive@1.1.1:
|
which-boxed-primitive@1.1.1:
|
||||||
resolution: {integrity: sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==}
|
resolution: {integrity: sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==}
|
||||||
engines: {node: '>= 0.4'}
|
engines: {node: '>= 0.4'}
|
||||||
@@ -5674,6 +5720,9 @@ packages:
|
|||||||
resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
|
resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
|
||||||
engines: {node: '>=10'}
|
engines: {node: '>=10'}
|
||||||
|
|
||||||
|
zlibjs@0.3.1:
|
||||||
|
resolution: {integrity: sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==}
|
||||||
|
|
||||||
zod@3.25.76:
|
zod@3.25.76:
|
||||||
resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==}
|
resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==}
|
||||||
|
|
||||||
@@ -7869,6 +7918,8 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
readable-stream: 3.6.2
|
readable-stream: 3.6.2
|
||||||
|
|
||||||
|
bmp-js@0.1.0: {}
|
||||||
|
|
||||||
brace-expansion@1.1.12:
|
brace-expansion@1.1.12:
|
||||||
dependencies:
|
dependencies:
|
||||||
balanced-match: 1.0.2
|
balanced-match: 1.0.2
|
||||||
@@ -8987,6 +9038,8 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
safer-buffer: 2.1.2
|
safer-buffer: 2.1.2
|
||||||
|
|
||||||
|
idb-keyval@6.2.2: {}
|
||||||
|
|
||||||
ieee754@1.2.1: {}
|
ieee754@1.2.1: {}
|
||||||
|
|
||||||
ignore@5.3.2: {}
|
ignore@5.3.2: {}
|
||||||
@@ -9175,6 +9228,8 @@ snapshots:
|
|||||||
dependencies:
|
dependencies:
|
||||||
which-typed-array: 1.1.20
|
which-typed-array: 1.1.20
|
||||||
|
|
||||||
|
is-url@1.2.4: {}
|
||||||
|
|
||||||
is-weakmap@2.0.2: {}
|
is-weakmap@2.0.2: {}
|
||||||
|
|
||||||
is-weakref@1.1.1:
|
is-weakref@1.1.1:
|
||||||
@@ -9563,6 +9618,10 @@ snapshots:
|
|||||||
object.entries: 1.1.9
|
object.entries: 1.1.9
|
||||||
semver: 6.3.1
|
semver: 6.3.1
|
||||||
|
|
||||||
|
node-fetch@2.7.0:
|
||||||
|
dependencies:
|
||||||
|
whatwg-url: 5.0.0
|
||||||
|
|
||||||
node-gyp-build-optional-packages@5.2.2:
|
node-gyp-build-optional-packages@5.2.2:
|
||||||
dependencies:
|
dependencies:
|
||||||
detect-libc: 2.1.2
|
detect-libc: 2.1.2
|
||||||
@@ -9651,6 +9710,8 @@ snapshots:
|
|||||||
ws: 8.18.3
|
ws: 8.18.3
|
||||||
zod: 3.25.76
|
zod: 3.25.76
|
||||||
|
|
||||||
|
opencollective-postinstall@2.0.3: {}
|
||||||
|
|
||||||
optionator@0.9.4:
|
optionator@0.9.4:
|
||||||
dependencies:
|
dependencies:
|
||||||
deep-is: 0.1.4
|
deep-is: 0.1.4
|
||||||
@@ -10365,6 +10426,8 @@ snapshots:
|
|||||||
|
|
||||||
regenerator-runtime@0.11.1: {}
|
regenerator-runtime@0.11.1: {}
|
||||||
|
|
||||||
|
regenerator-runtime@0.13.11: {}
|
||||||
|
|
||||||
regexp.prototype.flags@1.5.4:
|
regexp.prototype.flags@1.5.4:
|
||||||
dependencies:
|
dependencies:
|
||||||
call-bind: 1.0.8
|
call-bind: 1.0.8
|
||||||
@@ -10819,6 +10882,22 @@ snapshots:
|
|||||||
- tsx
|
- tsx
|
||||||
- yaml
|
- yaml
|
||||||
|
|
||||||
|
tesseract.js-core@7.0.0: {}
|
||||||
|
|
||||||
|
tesseract.js@7.0.0:
|
||||||
|
dependencies:
|
||||||
|
bmp-js: 0.1.0
|
||||||
|
idb-keyval: 6.2.2
|
||||||
|
is-url: 1.2.4
|
||||||
|
node-fetch: 2.7.0
|
||||||
|
opencollective-postinstall: 2.0.3
|
||||||
|
regenerator-runtime: 0.13.11
|
||||||
|
tesseract.js-core: 7.0.0
|
||||||
|
wasm-feature-detect: 1.8.0
|
||||||
|
zlibjs: 0.3.1
|
||||||
|
transitivePeerDependencies:
|
||||||
|
- encoding
|
||||||
|
|
||||||
thenify-all@1.6.0:
|
thenify-all@1.6.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
thenify: 3.3.1
|
thenify: 3.3.1
|
||||||
@@ -10866,6 +10945,8 @@ snapshots:
|
|||||||
|
|
||||||
toggle-selection@1.0.6: {}
|
toggle-selection@1.0.6: {}
|
||||||
|
|
||||||
|
tr46@0.0.3: {}
|
||||||
|
|
||||||
tr46@5.1.1:
|
tr46@5.1.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
punycode: 2.3.1
|
punycode: 2.3.1
|
||||||
@@ -11081,6 +11162,10 @@ snapshots:
|
|||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- msw
|
- msw
|
||||||
|
|
||||||
|
wasm-feature-detect@1.8.0: {}
|
||||||
|
|
||||||
|
webidl-conversions@3.0.1: {}
|
||||||
|
|
||||||
webidl-conversions@7.0.0: {}
|
webidl-conversions@7.0.0: {}
|
||||||
|
|
||||||
whatwg-url@14.2.0:
|
whatwg-url@14.2.0:
|
||||||
@@ -11088,6 +11173,11 @@ snapshots:
|
|||||||
tr46: 5.1.1
|
tr46: 5.1.1
|
||||||
webidl-conversions: 7.0.0
|
webidl-conversions: 7.0.0
|
||||||
|
|
||||||
|
whatwg-url@5.0.0:
|
||||||
|
dependencies:
|
||||||
|
tr46: 0.0.3
|
||||||
|
webidl-conversions: 3.0.1
|
||||||
|
|
||||||
which-boxed-primitive@1.1.1:
|
which-boxed-primitive@1.1.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
is-bigint: 1.1.0
|
is-bigint: 1.1.0
|
||||||
@@ -11167,6 +11257,8 @@ snapshots:
|
|||||||
|
|
||||||
yocto-queue@0.1.0: {}
|
yocto-queue@0.1.0: {}
|
||||||
|
|
||||||
|
zlibjs@0.3.1: {}
|
||||||
|
|
||||||
zod@3.25.76: {}
|
zod@3.25.76: {}
|
||||||
|
|
||||||
zod@4.3.6: {}
|
zod@4.3.6: {}
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ const saveSchema = z.object({
|
|||||||
apiKey: z.string().optional(),
|
apiKey: z.string().optional(),
|
||||||
clearApiKey: z.boolean().optional(),
|
clearApiKey: z.boolean().optional(),
|
||||||
useGlobal: z.boolean().optional(),
|
useGlobal: z.boolean().optional(),
|
||||||
|
aiEnabled: z.boolean().optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export const GET = withAuth(async (req, ctx) => {
|
export const GET = withAuth(async (req, ctx) => {
|
||||||
@@ -51,6 +52,7 @@ export const PUT = withAuth(async (req, ctx) => {
|
|||||||
apiKey: body.apiKey,
|
apiKey: body.apiKey,
|
||||||
clearApiKey: body.clearApiKey,
|
clearApiKey: body.clearApiKey,
|
||||||
useGlobal: body.useGlobal,
|
useGlobal: body.useGlobal,
|
||||||
|
aiEnabled: body.aiEnabled,
|
||||||
},
|
},
|
||||||
ctx.userId,
|
ctx.userId,
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -27,9 +27,16 @@ export const POST = withAuth(
|
|||||||
const mimeType = file.type || 'image/jpeg';
|
const mimeType = file.type || 'image/jpeg';
|
||||||
|
|
||||||
const config = await getResolvedOcrConfig(ctx.portId);
|
const config = await getResolvedOcrConfig(ctx.portId);
|
||||||
|
// Tesseract.js (in-browser) is the default. The server only invokes
|
||||||
|
// an AI provider when (a) the port admin has flipped `aiEnabled` on
|
||||||
|
// and (b) a key resolves. Otherwise the client falls back to its
|
||||||
|
// local Tesseract result.
|
||||||
|
if (!config.aiEnabled) {
|
||||||
|
return NextResponse.json({
|
||||||
|
data: { parsed: EMPTY, source: 'manual', reason: 'ai-disabled' },
|
||||||
|
});
|
||||||
|
}
|
||||||
if (!config.apiKey) {
|
if (!config.apiKey) {
|
||||||
// Manual-entry path — no OCR configured. Frontend will show the
|
|
||||||
// verify form with empty fields so the user can fill it in.
|
|
||||||
return NextResponse.json({
|
return NextResponse.json({
|
||||||
data: { parsed: EMPTY, source: 'manual', reason: 'no-ocr-configured' },
|
data: { parsed: EMPTY, source: 'manual', reason: 'no-ocr-configured' },
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ interface ConfigResp {
|
|||||||
model: string;
|
model: string;
|
||||||
hasApiKey: boolean;
|
hasApiKey: boolean;
|
||||||
useGlobal: boolean;
|
useGlobal: boolean;
|
||||||
|
aiEnabled: boolean;
|
||||||
};
|
};
|
||||||
models: Record<Provider, string[]>;
|
models: Record<Provider, string[]>;
|
||||||
}
|
}
|
||||||
@@ -56,6 +57,7 @@ function SettingsBlock({ scope, title, description, showUseGlobal }: SettingsBlo
|
|||||||
const [apiKey, setApiKey] = useState('');
|
const [apiKey, setApiKey] = useState('');
|
||||||
const [showKey, setShowKey] = useState(false);
|
const [showKey, setShowKey] = useState(false);
|
||||||
const [useGlobal, setUseGlobal] = useState(false);
|
const [useGlobal, setUseGlobal] = useState(false);
|
||||||
|
const [aiEnabled, setAiEnabled] = useState(false);
|
||||||
const [testStatus, setTestStatus] = useState<null | { ok: true } | { ok: false; reason: string }>(
|
const [testStatus, setTestStatus] = useState<null | { ok: true } | { ok: false; reason: string }>(
|
||||||
null,
|
null,
|
||||||
);
|
);
|
||||||
@@ -65,6 +67,7 @@ function SettingsBlock({ scope, title, description, showUseGlobal }: SettingsBlo
|
|||||||
setProvider(data.data.provider);
|
setProvider(data.data.provider);
|
||||||
setModel(data.data.model);
|
setModel(data.data.model);
|
||||||
setUseGlobal(data.data.useGlobal);
|
setUseGlobal(data.data.useGlobal);
|
||||||
|
setAiEnabled(data.data.aiEnabled);
|
||||||
}, [data?.data]);
|
}, [data?.data]);
|
||||||
|
|
||||||
const save = useMutation({
|
const save = useMutation({
|
||||||
@@ -78,6 +81,7 @@ function SettingsBlock({ scope, title, description, showUseGlobal }: SettingsBlo
|
|||||||
apiKey: apiKey.length > 0 ? apiKey : undefined,
|
apiKey: apiKey.length > 0 ? apiKey : undefined,
|
||||||
clearApiKey: Boolean(clearApiKey),
|
clearApiKey: Boolean(clearApiKey),
|
||||||
useGlobal: scope === 'global' ? false : useGlobal,
|
useGlobal: scope === 'global' ? false : useGlobal,
|
||||||
|
aiEnabled: scope === 'global' ? false : aiEnabled,
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
onSuccess: () => {
|
onSuccess: () => {
|
||||||
@@ -143,6 +147,26 @@ function SettingsBlock({ scope, title, description, showUseGlobal }: SettingsBlo
|
|||||||
</div>
|
</div>
|
||||||
) : null}
|
) : null}
|
||||||
|
|
||||||
|
{scope === 'port' ? (
|
||||||
|
<div className="flex items-start gap-2 rounded-lg border border-border bg-muted/30 p-3">
|
||||||
|
<Checkbox
|
||||||
|
id={`aiEnabled-${scope}`}
|
||||||
|
checked={aiEnabled}
|
||||||
|
onCheckedChange={(v) => setAiEnabled(v === true)}
|
||||||
|
/>
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<Label htmlFor={`aiEnabled-${scope}`} className="text-sm font-medium">
|
||||||
|
Enable AI receipt parsing for this port
|
||||||
|
</Label>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
Off by default. Receipts are read on-device using Tesseract.js — accurate enough for
|
||||||
|
most receipts and incurs no AI cost. Turning this on lets the configured provider
|
||||||
|
re-parse receipts server-side for higher accuracy on hard-to-read images.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
) : null}
|
||||||
|
|
||||||
<div className="grid grid-cols-1 gap-4 sm:grid-cols-2">
|
<div className="grid grid-cols-1 gap-4 sm:grid-cols-2">
|
||||||
<div className="space-y-1.5">
|
<div className="space-y-1.5">
|
||||||
<Label htmlFor={`provider-${scope}`}>Provider</Label>
|
<Label htmlFor={`provider-${scope}`}>Provider</Label>
|
||||||
@@ -267,14 +291,14 @@ export function OcrSettingsForm() {
|
|||||||
<PageHeader
|
<PageHeader
|
||||||
title="Receipt OCR"
|
title="Receipt OCR"
|
||||||
eyebrow="Admin"
|
eyebrow="Admin"
|
||||||
description="Configure the AI provider used to read receipts captured via the mobile scanner."
|
description="Receipts are scanned on-device by default. Optionally configure an AI provider for higher-accuracy parsing on tricky receipts."
|
||||||
variant="gradient"
|
variant="gradient"
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<SettingsBlock
|
<SettingsBlock
|
||||||
scope="port"
|
scope="port"
|
||||||
title="This port"
|
title="This port"
|
||||||
description="Provider and key used when staff at this port scan a receipt."
|
description="Optional AI provider for staff at this port. Tesseract.js handles all scans on-device until AI is enabled."
|
||||||
showUseGlobal
|
showUseGlobal
|
||||||
/>
|
/>
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import { useUIStore } from '@/stores/ui-store';
|
|||||||
import { apiFetch } from '@/lib/api/client';
|
import { apiFetch } from '@/lib/api/client';
|
||||||
import { cn } from '@/lib/utils';
|
import { cn } from '@/lib/utils';
|
||||||
import { EXPENSE_CATEGORIES, PAYMENT_METHODS } from '@/lib/constants';
|
import { EXPENSE_CATEGORIES, PAYMENT_METHODS } from '@/lib/constants';
|
||||||
|
import { runTesseract } from '@/lib/ocr/tesseract-client';
|
||||||
|
|
||||||
// ─── Types ────────────────────────────────────────────────────────────────────
|
// ─── Types ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -33,11 +34,11 @@ interface ParsedReceipt {
|
|||||||
|
|
||||||
type ScanState =
|
type ScanState =
|
||||||
| { kind: 'idle' }
|
| { kind: 'idle' }
|
||||||
| { kind: 'processing' }
|
| { kind: 'processing'; engine: 'tesseract' | 'ai' }
|
||||||
| {
|
| {
|
||||||
kind: 'verify';
|
kind: 'verify';
|
||||||
parsed: ParsedReceipt;
|
parsed: ParsedReceipt;
|
||||||
source: 'ai' | 'manual';
|
source: 'ai' | 'tesseract' | 'manual';
|
||||||
reason?: string;
|
reason?: string;
|
||||||
providerError?: string;
|
providerError?: string;
|
||||||
}
|
}
|
||||||
@@ -62,7 +63,7 @@ interface VerifyFormProps {
|
|||||||
parsed: ParsedReceipt;
|
parsed: ParsedReceipt;
|
||||||
imagePreview: string;
|
imagePreview: string;
|
||||||
imageFile: File;
|
imageFile: File;
|
||||||
source: 'ai' | 'manual';
|
source: 'ai' | 'tesseract' | 'manual';
|
||||||
reason?: string;
|
reason?: string;
|
||||||
providerError?: string;
|
providerError?: string;
|
||||||
onSubmit: (input: {
|
onSubmit: (input: {
|
||||||
@@ -86,7 +87,7 @@ function VerifyForm({
|
|||||||
imagePreview,
|
imagePreview,
|
||||||
imageFile,
|
imageFile,
|
||||||
source,
|
source,
|
||||||
reason,
|
reason: _reason,
|
||||||
providerError,
|
providerError,
|
||||||
onSubmit,
|
onSubmit,
|
||||||
onRetake,
|
onRetake,
|
||||||
@@ -100,30 +101,21 @@ function VerifyForm({
|
|||||||
const [paymentMethod, setPaymentMethod] = useState<string>('credit_card');
|
const [paymentMethod, setPaymentMethod] = useState<string>('credit_card');
|
||||||
const [description, setDescription] = useState('');
|
const [description, setDescription] = useState('');
|
||||||
|
|
||||||
const lowConfidence = source === 'ai' && parsed.confidence < 0.6;
|
const lowConfidence = source !== 'manual' && parsed.confidence < 0.6;
|
||||||
const noOcr = source === 'manual';
|
const noOcr = source === 'manual';
|
||||||
|
const engineLabel = source === 'ai' ? 'AI' : source === 'tesseract' ? 'on-device OCR' : 'manual';
|
||||||
|
|
||||||
const banner = noOcr ? (
|
const banner = noOcr ? (
|
||||||
<div className="flex items-start gap-2 rounded-lg border border-amber-300 bg-amber-50 px-3 py-2 text-sm text-amber-900">
|
<div className="flex items-start gap-2 rounded-lg border border-amber-300 bg-amber-50 px-3 py-2 text-sm text-amber-900">
|
||||||
<AlertTriangle className="mt-0.5 h-4 w-4 shrink-0" />
|
<AlertTriangle className="mt-0.5 h-4 w-4 shrink-0" />
|
||||||
<div>
|
<div>
|
||||||
{reason === 'no-ocr-configured' ? (
|
<p className="font-medium">Manual entry mode</p>
|
||||||
<>
|
<p className="text-xs mt-0.5">
|
||||||
<p className="font-medium">Manual entry mode</p>
|
{providerError
|
||||||
<p className="text-xs mt-0.5">
|
? `We couldn't read the receipt automatically: ${providerError}.`
|
||||||
No AI provider is configured for this port. Fill in the details below to save the
|
: "We couldn't read the receipt automatically."}{' '}
|
||||||
expense with the photo attached.
|
Fill in the details below to save the expense with the photo attached.
|
||||||
</p>
|
</p>
|
||||||
</>
|
|
||||||
) : (
|
|
||||||
<>
|
|
||||||
<p className="font-medium">We couldn't read the receipt automatically</p>
|
|
||||||
<p className="text-xs mt-0.5">
|
|
||||||
{providerError ? `Reason: ${providerError}.` : ''} Fill in the details below to save
|
|
||||||
the expense with the photo attached.
|
|
||||||
</p>
|
|
||||||
</>
|
|
||||||
)}
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
) : lowConfidence ? (
|
) : lowConfidence ? (
|
||||||
@@ -132,7 +124,7 @@ function VerifyForm({
|
|||||||
<div>
|
<div>
|
||||||
<p className="font-medium">Low-confidence read — please double-check the fields</p>
|
<p className="font-medium">Low-confidence read — please double-check the fields</p>
|
||||||
<p className="text-xs mt-0.5">
|
<p className="text-xs mt-0.5">
|
||||||
The AI returned a confidence of {Math.round(parsed.confidence * 100)}%.
|
{engineLabel} returned {Math.round(parsed.confidence * 100)}% confidence.
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -141,7 +133,9 @@ function VerifyForm({
|
|||||||
<CheckCircle2 className="mt-0.5 h-4 w-4 shrink-0" />
|
<CheckCircle2 className="mt-0.5 h-4 w-4 shrink-0" />
|
||||||
<div>
|
<div>
|
||||||
<p className="font-medium">Receipt parsed — confirm the fields and save</p>
|
<p className="font-medium">Receipt parsed — confirm the fields and save</p>
|
||||||
<p className="text-xs mt-0.5">Confidence {Math.round(parsed.confidence * 100)}%.</p>
|
<p className="text-xs mt-0.5">
|
||||||
|
{engineLabel} · {Math.round(parsed.confidence * 100)}% confidence.
|
||||||
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
@@ -306,7 +300,38 @@ export function ScanShell() {
|
|||||||
async function handleFile(file: File) {
|
async function handleFile(file: File) {
|
||||||
if (imagePreview) URL.revokeObjectURL(imagePreview);
|
if (imagePreview) URL.revokeObjectURL(imagePreview);
|
||||||
setImagePreview(URL.createObjectURL(file));
|
setImagePreview(URL.createObjectURL(file));
|
||||||
setState({ kind: 'processing' });
|
setState({ kind: 'processing', engine: 'tesseract' });
|
||||||
|
|
||||||
|
// Always run Tesseract first — it's free, on-device, and gives us a
|
||||||
|
// baseline parse we can fall back to if the optional AI pass is off
|
||||||
|
// or fails. The WASM bundle dynamic-imports inside `runTesseract`.
|
||||||
|
let tesseract: Awaited<ReturnType<typeof runTesseract>> | null = null;
|
||||||
|
try {
|
||||||
|
tesseract = await runTesseract(file);
|
||||||
|
} catch (err) {
|
||||||
|
// Tesseract.js itself failed (corrupt image, OOM, etc). Don't bail —
|
||||||
|
// give the user the manual form so they can still save the expense.
|
||||||
|
setState({
|
||||||
|
kind: 'verify',
|
||||||
|
parsed: {
|
||||||
|
establishment: null,
|
||||||
|
date: null,
|
||||||
|
amount: null,
|
||||||
|
currency: null,
|
||||||
|
lineItems: [],
|
||||||
|
confidence: 0,
|
||||||
|
},
|
||||||
|
source: 'manual',
|
||||||
|
reason: 'tesseract-error',
|
||||||
|
providerError: err instanceof Error ? err.message : 'On-device OCR failed',
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now ask the server whether AI is enabled for this port. If it is,
|
||||||
|
// the server runs the configured provider and returns a richer parse;
|
||||||
|
// otherwise we keep the Tesseract result.
|
||||||
|
setState({ kind: 'processing', engine: 'ai' });
|
||||||
try {
|
try {
|
||||||
const fd = new FormData();
|
const fd = new FormData();
|
||||||
fd.append('file', file);
|
fd.append('file', file);
|
||||||
@@ -319,21 +344,38 @@ export function ScanShell() {
|
|||||||
credentials: 'include',
|
credentials: 'include',
|
||||||
headers,
|
headers,
|
||||||
});
|
});
|
||||||
if (!res.ok) {
|
if (!res.ok) throw new Error(`Server returned ${res.status}`);
|
||||||
throw new Error(`Server returned ${res.status}`);
|
|
||||||
}
|
|
||||||
const body = (await res.json()) as ScanResp;
|
const body = (await res.json()) as ScanResp;
|
||||||
|
|
||||||
|
if (body.data.source === 'ai' && body.data.parsed.confidence >= tesseract.parsed.confidence) {
|
||||||
|
// AI did at least as well as Tesseract — prefer its result.
|
||||||
|
setState({
|
||||||
|
kind: 'verify',
|
||||||
|
parsed: body.data.parsed,
|
||||||
|
source: 'ai',
|
||||||
|
reason: body.data.reason,
|
||||||
|
providerError: body.data.providerError,
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Either AI is disabled (`source: 'manual', reason: 'ai-disabled'`),
|
||||||
|
// not configured, or it underperformed — fall back to Tesseract.
|
||||||
setState({
|
setState({
|
||||||
kind: 'verify',
|
kind: 'verify',
|
||||||
parsed: body.data.parsed,
|
parsed: tesseract.parsed,
|
||||||
source: body.data.source,
|
source: 'tesseract',
|
||||||
reason: body.data.reason,
|
reason: body.data.reason,
|
||||||
providerError: body.data.providerError,
|
providerError: body.data.providerError,
|
||||||
});
|
});
|
||||||
} catch (err) {
|
} catch {
|
||||||
|
// Server unreachable — still let the user verify with the Tesseract
|
||||||
|
// result and save the expense. We don't surface the network error
|
||||||
|
// because the local parse is usable.
|
||||||
setState({
|
setState({
|
||||||
kind: 'error',
|
kind: 'verify',
|
||||||
message: err instanceof Error ? err.message : 'Upload failed',
|
parsed: tesseract.parsed,
|
||||||
|
source: 'tesseract',
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -446,7 +488,9 @@ export function ScanShell() {
|
|||||||
{state.kind === 'processing' ? (
|
{state.kind === 'processing' ? (
|
||||||
<section className="flex flex-1 flex-col items-center justify-center gap-3 py-12">
|
<section className="flex flex-1 flex-col items-center justify-center gap-3 py-12">
|
||||||
<Loader2 className="h-10 w-10 animate-spin text-brand" />
|
<Loader2 className="h-10 w-10 animate-spin text-brand" />
|
||||||
<p className="text-sm text-muted-foreground">Reading receipt…</p>
|
<p className="text-sm text-muted-foreground">
|
||||||
|
{state.engine === 'tesseract' ? 'Reading on-device…' : 'Refining with AI…'}
|
||||||
|
</p>
|
||||||
</section>
|
</section>
|
||||||
) : null}
|
) : null}
|
||||||
|
|
||||||
|
|||||||
302
src/lib/ocr/parse-receipt-text.ts
Normal file
302
src/lib/ocr/parse-receipt-text.ts
Normal file
@@ -0,0 +1,302 @@
|
|||||||
|
/**
|
||||||
|
* Heuristic parser for raw OCR text from a receipt image.
|
||||||
|
*
|
||||||
|
* Tesseract returns plain text — we extract structured fields (vendor, date,
|
||||||
|
* amount, currency, line items) using regex/positional rules. The output
|
||||||
|
* matches `ParsedReceipt` from `ocr-providers.ts` so callers don't need to
|
||||||
|
* branch on which engine produced it.
|
||||||
|
*
|
||||||
|
 * Confidence is computed from how many fields we managed to recover, scaled
 * by the 0–100 confidence score Tesseract reports, when one is provided.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { ParsedReceipt, ParsedReceiptLineItem } from '@/lib/services/ocr-providers';
|
||||||
|
|
||||||
|
/** Common currency symbols mapped to the ISO 4217 code we report for them. */
const CURRENCY_SYMBOLS: Record<string, string> = {
  $: 'USD',
  '€': 'EUR',
  '£': 'GBP',
  '¥': 'JPY',
  '₣': 'CHF',
  '₹': 'INR',
  '₽': 'RUB',
  '₱': 'PHP',
  '₩': 'KRW',
};

/**
 * ISO 4217 codes we recognize when they appear as a stand-alone token.
 *
 * Every code that `CURRENCY_SYMBOLS` can produce is also listed here, so a
 * receipt that prints the code ("PHP") is detected the same way as one that
 * prints the symbol ("₱").
 */
const CURRENCY_CODES = new Set([
  'USD',
  'EUR',
  'GBP',
  'JPY',
  'CHF',
  'CAD',
  'AUD',
  'NZD',
  'SEK',
  'NOK',
  'DKK',
  'PLN',
  'CZK',
  'HUF',
  'INR',
  'CNY',
  'HKD',
  'SGD',
  'AED',
  'ILS',
  'TRY',
  'ZAR',
  'BRL',
  'MXN',
  'RUB',
  'KRW',
  'PHP',
]);
|
||||||
|
|
||||||
|
/** Three-letter month prefix (lowercase) → two-digit month number. */
const MONTH_NUMBER_BY_PREFIX: Record<string, string> = {
  jan: '01',
  feb: '02',
  mar: '03',
  apr: '04',
  may: '05',
  jun: '06',
  jul: '07',
  aug: '08',
  sep: '09',
  oct: '10',
  nov: '11',
  dec: '12',
};

/**
 * Patterns we try in order; the first pattern that yields a valid date wins.
 *
 * Each entry pairs a (non-global) regex with a `build` callback that turns
 * the match into a `YYYY-MM-DD` string, or `null` when the captured parts do
 * not form a plausible date. The regexes are kept non-global because other
 * helpers call `.test()` on them and must not be affected by `lastIndex`.
 */
const DATE_PATTERNS: Array<{ regex: RegExp; build: (m: RegExpMatchArray) => string | null }> = [
  // ISO 2024-04-28
  {
    regex: /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
    build: (m) => normalizeDate(m[1]!, m[2]!, m[3]!),
  },
  // 28/04/2024 or 28-04-2024 (DMY — common in EU)
  {
    regex: /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
    build: (m) => {
      const d = m[1]!;
      const mo = m[2]!;
      // Two-digit years are assumed to be in the 2000s.
      const y = m[3]!.length === 2 ? `20${m[3]}` : m[3]!;
      // We can't tell DMY from MDY; trust DMY which is more common globally
      // and won't fail validation as long as month <= 12.
      return normalizeDate(y, mo, d);
    },
  },
  // 28 Apr 2024 / 28-Apr-2024 / 28.Apr.2024 / 28 Apr, 2024
  {
    // Separators may be whitespace, dash, dot or slash (plus a comma before
    // the year) so the dash-separated form is matched as well as the spaced one.
    regex:
      /\b(\d{1,2})[\s.\-/]+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*[\s.\-/,]+(\d{2,4})\b/i,
    build: (m) => {
      const mo = MONTH_NUMBER_BY_PREFIX[m[2]!.toLowerCase().slice(0, 3)];
      if (!mo) return null;
      const y = m[3]!.length === 2 ? `20${m[3]}` : m[3]!;
      return normalizeDate(y, mo, m[1]!);
    },
  },
];
|
||||||
|
|
||||||
|
/**
 * Assembles year/month/day strings into a zero-padded `YYYY-MM-DD` string.
 *
 * Returns `null` when the parts do not form a real calendar date (checked by
 * parsing the string and comparing it with its own ISO rendering) or when the
 * year lies outside 2000–2100.
 */
function normalizeDate(year: string, month: string, day: string): string | null {
  const paddedYear = year.padStart(4, '0');
  const iso = [paddedYear, month.padStart(2, '0'), day.padStart(2, '0')].join('-');

  const parsed = new Date(iso);
  // Unparseable input: bail out before toISOString(), which would throw.
  if (Number.isNaN(parsed.getTime())) return null;
  // A rolled-over day (e.g. Feb 30 → Mar 1) no longer renders as the input.
  if (parsed.toISOString().slice(0, 10) !== iso) return null;

  // Don't accept implausibly old or future-dated receipts.
  const numericYear = Number(paddedYear);
  const plausible = numericYear >= 2000 && numericYear <= 2100;
  return plausible ? iso : null;
}
|
||||||
|
|
||||||
|
/**
 * Pulls the first recognizable date out of `text`.
 *
 * Patterns are tried in `DATE_PATTERNS` order. Within a pattern every
 * occurrence is tried, not just the first one, so an invalid look-alike
 * earlier in the text (a dotted phone number, an out-of-range year) does not
 * hide a valid date that appears later.
 *
 * @returns The date as `YYYY-MM-DD`, or `null` when no candidate validates.
 */
function extractDate(text: string): string | null {
  for (const { regex, build } of DATE_PATTERNS) {
    // matchAll needs a global regex; build a private copy so the shared
    // pattern object stays non-global and stateless for `.test()` callers.
    const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`;
    const globalRegex = new RegExp(regex.source, flags);
    for (const m of text.matchAll(globalRegex)) {
      const d = build(m);
      if (d) return d;
    }
  }
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Detects the receipt currency from `text`.
 *
 * A known currency symbol anywhere in the text takes precedence (checked in
 * `CURRENCY_SYMBOLS` key order); otherwise the first stand-alone uppercase
 * three-letter token that is a recognized ISO code is used.
 *
 * @returns The ISO 4217 code, or `null` when nothing matches.
 */
function extractCurrency(text: string): string | null {
  const symbol = Object.keys(CURRENCY_SYMBOLS).find((candidate) => text.includes(candidate));
  if (symbol !== undefined) return CURRENCY_SYMBOLS[symbol]!;

  // Match a stand-alone uppercase 3-letter token.
  const tokens = text.match(/\b([A-Z]{3})\b/g) ?? [];
  const code = tokens.find((token) => CURRENCY_CODES.has(token));
  return code ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Extracts the receipt total from the already-split `lines`.
 *
 * 1. Every line mentioning a total ("total", "grand total", "total due",
 *    "balance due", "amount due", "to pay") contributes its largest number.
 *    Lines with one of the explicit final-amount phrases outrank a plain
 *    "total"; among lines of equal rank the larger amount wins.
 * 2. If no such line carries a number, the largest number found anywhere on
 *    the receipt is returned.
 *
 * @returns The chosen amount, or `null` when the receipt contains no numbers.
 */
function extractAmount(lines: string[]): number | null {
  const anyTotal = /\b(grand\s*total|total\s*due|balance\s*due|amount\s*due|total|to\s*pay)\b/i;
  const finalTotal = /grand\s*total|total\s*due|balance\s*due|amount\s*due|to\s*pay/i;

  let bestAmount: number | null = null;
  let bestPriority = 0;

  for (const line of lines) {
    if (!anyTotal.test(line)) continue;
    const values = extractNumbers(line);
    if (values.length === 0) continue;

    // Largest number on the line (subtotal+tax often appear before the total).
    const lineAmount = Math.max(...values);
    const linePriority = finalTotal.test(line) ? 2 : 1;

    const outranks = linePriority > bestPriority;
    const tiesButLarger =
      linePriority === bestPriority && bestAmount !== null && lineAmount > bestAmount;
    if (bestAmount === null || outranks || tiesButLarger) {
      bestAmount = lineAmount;
      bestPriority = linePriority;
    }
  }
  if (bestAmount !== null) return bestAmount;

  // Fallback: largest number on the whole receipt.
  const everyNumber: number[] = [];
  for (const line of lines) {
    everyNumber.push(...extractNumbers(line));
  }
  return everyNumber.length > 0 ? Math.max(...everyNumber) : null;
}
|
||||||
|
|
||||||
|
/**
 * Pulls numeric values out of a line, supporting `1,234.56`, `1.234,56` and
 * ungrouped amounts such as `1250.00`.
 *
 * A token must not touch a letter or another digit on either side. Three
 * shapes are accepted:
 *   - grouped thousands with an optional 1–2 digit fraction (`1,234`, `1.234,56`)
 *   - any number of digits followed by a 1–2 digit fraction (`45.00`, `1250.00`)
 *   - a bare integer of at most three digits (`5`, `123`)
 * Longer bare integers stay excluded so years, phone numbers and postal
 * codes are not mistaken for amounts.
 *
 * @returns The parsed values in order of appearance; values whose magnitude
 *          is below 0.01 are dropped.
 */
function extractNumbers(line: string): number[] {
  const out: number[] = [];
  const re =
    /(?<![A-Za-z0-9])-?(?:\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{1,2})?|\d+[.,]\d{1,2}|\d{1,3})(?![A-Za-z0-9])/g;
  for (const match of line.matchAll(re)) {
    const parsed = parseLocaleNumber(match[0]);
    if (parsed != null && Math.abs(parsed) >= 0.01) out.push(parsed);
  }
  return out;
}
|
||||||
|
|
||||||
|
/**
 * Converts a numeric token that may use `,` or `.` as decimal or grouping
 * separator into a number.
 *
 * Every separator before the last one is treated as grouping and removed.
 * The last separator is the decimal point, except when the same character
 * already appeared earlier and exactly three digits follow it: then it is
 * one more grouping separator (`1,234,567` → 1234567).
 *
 * A single separator followed by three digits (`1,234`) is ambiguous
 * (thousands vs. a three-decimal unit price) and keeps its decimal reading.
 *
 * @returns The parsed value, or `null` when the token is not a finite number.
 */
function parseLocaleNumber(raw: string): number | null {
  const lastSep = Math.max(raw.lastIndexOf(','), raw.lastIndexOf('.'));

  let cleaned: string;
  if (lastSep === -1) {
    cleaned = raw;
  } else {
    const head = raw.slice(0, lastSep);
    const tail = raw.slice(lastSep + 1);
    const integerPart = head.replace(/[.,]/g, '');
    // Same separator seen before + a full 3-digit group ⇒ grouping, not decimal.
    const lastIsGrouping = head.includes(raw.charAt(lastSep)) && tail.length === 3;
    cleaned = lastIsGrouping ? `${integerPart}${tail}` : `${integerPart}.${tail}`;
  }

  const n = Number(cleaned);
  return Number.isFinite(n) ? n : null;
}
|
||||||
|
|
||||||
|
/**
 * Vendor heuristic: receipts almost always print the merchant name at the
 * top, so the first acceptable line among the first six is returned.
 *
 * A line is acceptable when it is at least 3 characters long after trimming,
 * contains at least two ASCII letters, does not match any date pattern, and
 * does not start with a generic heading such as "Receipt" or "Invoice".
 *
 * @returns The vendor name truncated to 120 characters, or `null`.
 */
function extractVendor(lines: string[]): string | null {
  const genericHeading = /^(receipt|invoice|tax invoice|order|ticket)/i;

  const vendorLine = lines
    .slice(0, 6)
    .map((line) => line.trim())
    .find((candidate) => {
      const longEnough = candidate.length >= 3;
      // At least two letters — drops pure-punctuation noise like "@@@" and
      // divider rows like "===".
      const letterCount = (candidate.match(/[A-Za-z]/g) ?? []).length;
      if (!longEnough || letterCount < 2) return false;
      if (DATE_PATTERNS.some((pattern) => pattern.regex.test(candidate))) return false;
      return !genericHeading.test(candidate);
    });

  return vendorLine === undefined ? null : vendorLine.slice(0, 120);
}
|
||||||
|
|
||||||
|
/**
 * Pulls line items: lines with both descriptive text and a trailing number.
 *
 * Lines are skipped when they mention a summary/payment keyword (subtotal,
 * tax, total, cash, …), contain a date, start with a header label ("Date",
 * "Tel", "Cashier", …), look like a street address, contain no number, or do
 * not end in a numeric token. At most 20 items are returned.
 *
 * The description is everything before the trailing numeric token. It is cut
 * at the position where that token starts rather than located by searching
 * for the parsed value's digits, which could land inside the price itself
 * ("Bagel 3.30" used to yield "Bagel 3").
 *
 * @returns Items with a description (max 120 chars) and the last number
 *          parsed from the line as the amount.
 */
function extractLineItems(lines: string[]): ParsedReceiptLineItem[] {
  const skipMarker = /\b(subtotal|tax|vat|gst|total|tip|service|change|cash|card|tend|due)\b/i;
  const headerLabel =
    /^\s*(date|time|tel|phone|store|store#|cashier|order|table|receipt|invoice)\b/i;
  const streetSuffix = /\b(st|ave|blvd|rd|way|lane|ln|drive|dr)\b/i;
  const trailingNumber = /[.,]?\d[\d.,]*\s*$/;

  const out: ParsedReceiptLineItem[] = [];
  for (const line of lines) {
    if (skipMarker.test(line)) continue;
    // Skip header-ish rows: dates, postal codes, "Date:" / "Time:" labels.
    if (DATE_PATTERNS.some((p) => p.regex.test(line))) continue;
    if (headerLabel.test(line)) continue;
    // Skip lines that look like an address: leading street number, common suffixes.
    if (/^\s*\d+\s+\w/.test(line) && streetSuffix.test(line)) continue;

    const numbers = extractNumbers(line);
    if (numbers.length === 0) continue;

    // Line items always have the price at the END; if the only number is at
    // the start (e.g. street number), this isn't a line item.
    const trailing = trailingNumber.exec(line);
    if (!trailing) continue;

    const lastNum = numbers[numbers.length - 1]!;
    const description = line
      .slice(0, trailing.index)
      .trim()
      // Drop dot leaders, dashes (including a minus sign) and spaces left
      // dangling in front of the price.
      .replace(/[.\-–—\s]+$/, '');
    if (description.length < 2) continue;

    out.push({ description: description.slice(0, 120), amount: lastNum });
    if (out.length >= 20) break;
  }
  return out;
}
|
||||||
|
|
||||||
|
/**
 * Scores a parse between 0 and 1, rounded to two decimals.
 *
 * The score is the fraction of the three headline fields (vendor, date,
 * amount) that are truthy, multiplied by `ocrConfidence / 100` clamped to
 * the 0–1 range. A null/undefined `ocrConfidence` counts as a factor of 1.
 */
function computeConfidence(
  fields: { vendor: unknown; date: unknown; amount: unknown },
  ocrConfidence: number | null,
): number {
  let recovered = 0;
  for (const value of [fields.vendor, fields.date, fields.amount]) {
    if (value) recovered += 1;
  }

  let ocrScore = 1;
  if (ocrConfidence != null) {
    const scaled = ocrConfidence / 100;
    ocrScore = Math.max(0, Math.min(1, scaled));
  }

  const combined = (recovered / 3) * ocrScore;
  return Number(combined.toFixed(2));
}
|
||||||
|
|
||||||
|
/** Input for `parseReceiptText`. */
export interface ParseReceiptInput {
  /** Raw OCR text; lines may be separated by `\n` or `\r\n`. */
  text: string;
  /** 0–100 from Tesseract, or null if we don't have it. */
  ocrConfidence?: number | null;
}

/**
 * Turns raw OCR text into a `ParsedReceipt`.
 *
 * The text is split into trimmed, non-empty lines for the line-based
 * extractors (vendor, amount, line items); date and currency are searched in
 * the full text. The confidence combines how many of vendor/date/amount were
 * found with `ocrConfidence`.
 */
export function parseReceiptText({ text, ocrConfidence = null }: ParseReceiptInput): ParsedReceipt {
  const lines: string[] = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line) lines.push(line);
  }

  const establishment = extractVendor(lines);
  const date = extractDate(text);
  const amount = extractAmount(lines);

  return {
    establishment,
    date,
    amount,
    currency: extractCurrency(text),
    lineItems: extractLineItems(lines),
    confidence: computeConfidence({ vendor: establishment, date, amount }, ocrConfidence),
  };
}
|
||||||
30
src/lib/ocr/tesseract-client.ts
Normal file
30
src/lib/ocr/tesseract-client.ts
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
/**
|
||||||
|
* Browser-only Tesseract.js wrapper. The WASM bundle is ~5 MB so we
|
||||||
|
* lazy-import on first use; subsequent scans reuse the cached module.
|
||||||
|
*
|
||||||
|
* Tesseract runs entirely in the browser — no image data leaves the
|
||||||
|
* user's device on this code path. AI providers (OpenAI/Claude) are
|
||||||
|
* a separate, opt-in path that runs server-side.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { ParsedReceipt } from '@/lib/services/ocr-providers';
|
||||||
|
import { parseReceiptText } from '@/lib/ocr/parse-receipt-text';
|
||||||
|
|
||||||
|
/** What a single on-device OCR run produces. */
interface TesseractRunResult {
  /** Structured fields extracted from `rawText` by `parseReceiptText`. */
  parsed: ParsedReceipt;
  /** Text exactly as returned by Tesseract (empty string if none). */
  rawText: string;
  /** 0–100 mean per-word confidence reported by Tesseract. */
  confidence: number;
}

/**
 * Lazy-imports tesseract.js, runs English OCR on `file`, and parses the
 * recognized text into a receipt.
 *
 * A missing text result becomes an empty string and a non-numeric confidence
 * becomes 0, so the parser always receives well-formed input.
 */
export async function runTesseract(file: File): Promise<TesseractRunResult> {
  // Dynamic import — the ~5 MB tesseract bundle stays out of the main chunk.
  const { recognize } = await import('tesseract.js');

  const result = await recognize(file, 'eng');
  const page = result.data;

  const rawText = page.text ?? '';
  let confidence = 0;
  if (typeof page.confidence === 'number') {
    confidence = page.confidence;
  }

  return {
    parsed: parseReceiptText({ text: rawText, ocrConfidence: confidence }),
    rawText,
    confidence,
  };
}
|
||||||
@@ -30,6 +30,12 @@ export interface OcrConfigPublic {
|
|||||||
hasApiKey: boolean;
|
hasApiKey: boolean;
|
||||||
/** Port-level rows can opt into the global config. */
|
/** Port-level rows can opt into the global config. */
|
||||||
useGlobal: boolean;
|
useGlobal: boolean;
|
||||||
|
/**
|
||||||
|
* AI receipt parsing is opt-in per port. When false (the default),
|
||||||
|
* the scanner uses the in-browser Tesseract.js engine and the AI
|
||||||
|
* provider is never called even if a key is configured.
|
||||||
|
*/
|
||||||
|
aiEnabled: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Internal shape including the decrypted key — server-side only. */
|
/** Internal shape including the decrypted key — server-side only. */
|
||||||
@@ -44,6 +50,7 @@ interface StoredOcrConfig {
|
|||||||
model: string;
|
model: string;
|
||||||
apiKeyEncrypted: string | null;
|
apiKeyEncrypted: string | null;
|
||||||
useGlobal: boolean;
|
useGlobal: boolean;
|
||||||
|
aiEnabled?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
const KEY = 'ocr.config';
|
const KEY = 'ocr.config';
|
||||||
@@ -90,15 +97,20 @@ export async function getResolvedOcrConfig(portId: string): Promise<OcrConfigRes
|
|||||||
apiKey: null,
|
apiKey: null,
|
||||||
hasApiKey: false,
|
hasApiKey: false,
|
||||||
useGlobal: portRow?.useGlobal === true,
|
useGlobal: portRow?.useGlobal === true,
|
||||||
|
aiEnabled: false,
|
||||||
source: 'none',
|
source: 'none',
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
// The aiEnabled flag is per-port: even if the port falls back to a global
|
||||||
|
// key, the port admin still has to flip the switch on this port.
|
||||||
|
const aiEnabled = portRow?.aiEnabled === true;
|
||||||
return {
|
return {
|
||||||
provider: sourceRow.provider,
|
provider: sourceRow.provider,
|
||||||
model: sourceRow.model,
|
model: sourceRow.model,
|
||||||
apiKey: sourceRow.apiKeyEncrypted ? decrypt(sourceRow.apiKeyEncrypted) : null,
|
apiKey: sourceRow.apiKeyEncrypted ? decrypt(sourceRow.apiKeyEncrypted) : null,
|
||||||
hasApiKey: Boolean(sourceRow.apiKeyEncrypted),
|
hasApiKey: Boolean(sourceRow.apiKeyEncrypted),
|
||||||
useGlobal: portRow?.useGlobal === true,
|
useGlobal: portRow?.useGlobal === true,
|
||||||
|
aiEnabled,
|
||||||
source: useGlobal ? 'global' : 'port',
|
source: useGlobal ? 'global' : 'port',
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -112,6 +124,7 @@ export async function getPublicOcrConfig(portId: string | null): Promise<OcrConf
|
|||||||
model: DEFAULT_MODEL.openai,
|
model: DEFAULT_MODEL.openai,
|
||||||
hasApiKey: false,
|
hasApiKey: false,
|
||||||
useGlobal: false,
|
useGlobal: false,
|
||||||
|
aiEnabled: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
return {
|
return {
|
||||||
@@ -119,6 +132,7 @@ export async function getPublicOcrConfig(portId: string | null): Promise<OcrConf
|
|||||||
model: row.model,
|
model: row.model,
|
||||||
hasApiKey: Boolean(row.apiKeyEncrypted),
|
hasApiKey: Boolean(row.apiKeyEncrypted),
|
||||||
useGlobal: row.useGlobal,
|
useGlobal: row.useGlobal,
|
||||||
|
aiEnabled: row.aiEnabled === true,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,6 +144,8 @@ export interface SaveOcrConfigInput {
|
|||||||
/** When true, clears the stored key. */
|
/** When true, clears the stored key. */
|
||||||
clearApiKey?: boolean;
|
clearApiKey?: boolean;
|
||||||
useGlobal?: boolean;
|
useGlobal?: boolean;
|
||||||
|
/** Per-port toggle: enable AI receipt parsing. Defaults to false. */
|
||||||
|
aiEnabled?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function saveOcrConfig(
|
export async function saveOcrConfig(
|
||||||
@@ -144,6 +160,9 @@ export async function saveOcrConfig(
|
|||||||
} else if (input.apiKey !== undefined && input.apiKey.length > 0) {
|
} else if (input.apiKey !== undefined && input.apiKey.length > 0) {
|
||||||
apiKeyEncrypted = encrypt(input.apiKey);
|
apiKeyEncrypted = encrypt(input.apiKey);
|
||||||
}
|
}
|
||||||
|
// AI is meaningful only at the port scope. Preserve the existing flag if the
|
||||||
|
// caller didn't pass one (so toggling provider/model doesn't re-disable AI).
|
||||||
|
const aiEnabled = portId === null ? false : (input.aiEnabled ?? existing?.aiEnabled ?? false);
|
||||||
await writeRow(
|
await writeRow(
|
||||||
portId,
|
portId,
|
||||||
{
|
{
|
||||||
@@ -151,6 +170,7 @@ export async function saveOcrConfig(
|
|||||||
model: input.model,
|
model: input.model,
|
||||||
apiKeyEncrypted,
|
apiKeyEncrypted,
|
||||||
useGlobal: portId === null ? false : Boolean(input.useGlobal),
|
useGlobal: portId === null ? false : Boolean(input.useGlobal),
|
||||||
|
aiEnabled,
|
||||||
},
|
},
|
||||||
userId,
|
userId,
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -115,6 +115,38 @@ describe('OCR config', () => {
|
|||||||
expect(resolved.model).toBe('gpt-4o');
|
expect(resolved.model).toBe('gpt-4o');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('aiEnabled defaults to false and round-trips when toggled', async () => {
|
||||||
|
const port = await makePort();
|
||||||
|
await saveOcrConfig(
|
||||||
|
port.id,
|
||||||
|
{ provider: 'openai', model: 'gpt-4o-mini', apiKey: 'sk-x' },
|
||||||
|
'user-1',
|
||||||
|
);
|
||||||
|
let resolved = await getResolvedOcrConfig(port.id);
|
||||||
|
expect(resolved.aiEnabled).toBe(false);
|
||||||
|
|
||||||
|
await saveOcrConfig(
|
||||||
|
port.id,
|
||||||
|
{ provider: 'openai', model: 'gpt-4o-mini', aiEnabled: true },
|
||||||
|
'user-1',
|
||||||
|
);
|
||||||
|
resolved = await getResolvedOcrConfig(port.id);
|
||||||
|
expect(resolved.aiEnabled).toBe(true);
|
||||||
|
expect(resolved.apiKey).toBe('sk-x'); // not wiped by the toggle
|
||||||
|
});
|
||||||
|
|
||||||
|
it('aiEnabled is forced false at global scope', async () => {
|
||||||
|
await saveOcrConfig(
|
||||||
|
null,
|
||||||
|
{ provider: 'openai', model: 'gpt-4o-mini', apiKey: 'g', aiEnabled: true },
|
||||||
|
'user-1',
|
||||||
|
);
|
||||||
|
const port = await makePort();
|
||||||
|
const resolved = await getResolvedOcrConfig(port.id);
|
||||||
|
// Resolved AI flag is per-port, not inherited from global.
|
||||||
|
expect(resolved.aiEnabled).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
it('global rows force useGlobal=false on save (not meaningful at global scope)', async () => {
|
it('global rows force useGlobal=false on save (not meaningful at global scope)', async () => {
|
||||||
await saveOcrConfig(
|
await saveOcrConfig(
|
||||||
null,
|
null,
|
||||||
|
|||||||
101
tests/unit/ocr/parse-receipt-text.test.ts
Normal file
101
tests/unit/ocr/parse-receipt-text.test.ts
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
|
||||||
|
import { parseReceiptText } from '@/lib/ocr/parse-receipt-text';
|
||||||
|
|
||||||
|
describe('parseReceiptText', () => {
|
||||||
|
it('extracts vendor, date, total, currency, and line items from a basic English receipt', () => {
|
||||||
|
const text = `
|
||||||
|
Marina Fuel Station
|
||||||
|
123 Harbor Way
|
||||||
|
Anguilla
|
||||||
|
|
||||||
|
Date: 2026-04-28
|
||||||
|
|
||||||
|
Diesel 45.00
|
||||||
|
Pump Fee 5.00
|
||||||
|
Subtotal 50.00
|
||||||
|
Tax 5.00
|
||||||
|
TOTAL 55.00
|
||||||
|
|
||||||
|
Thank you!
|
||||||
|
`;
|
||||||
|
const r = parseReceiptText({ text, ocrConfidence: 92 });
|
||||||
|
expect(r.establishment).toBe('Marina Fuel Station');
|
||||||
|
expect(r.date).toBe('2026-04-28');
|
||||||
|
expect(r.amount).toBe(55);
|
||||||
|
expect(r.lineItems.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(r.lineItems[0]!.description).toMatch(/diesel/i);
|
||||||
|
expect(r.confidence).toBeGreaterThan(0.5);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('parses European date and comma-decimal amount', () => {
|
||||||
|
const text = `
|
||||||
|
Boulangerie du Port
|
||||||
|
Rue de la Marina
|
||||||
|
|
||||||
|
28/04/2026
|
||||||
|
|
||||||
|
Pain 3,50
|
||||||
|
Café 2,50
|
||||||
|
|
||||||
|
Total: 6,00 €
|
||||||
|
`;
|
||||||
|
const r = parseReceiptText({ text });
|
||||||
|
expect(r.date).toBe('2026-04-28');
|
||||||
|
expect(r.amount).toBe(6);
|
||||||
|
expect(r.currency).toBe('EUR');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('handles ISO currency codes when no symbol is present', () => {
|
||||||
|
const text = `
|
||||||
|
Receipt
|
||||||
|
Acme Co.
|
||||||
|
Total 199.00 USD
|
||||||
|
`;
|
||||||
|
const r = parseReceiptText({ text });
|
||||||
|
expect(r.currency).toBe('USD');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns null fields and zero confidence when nothing parseable', () => {
|
||||||
|
const r = parseReceiptText({ text: '@@@\n!!!\n###' });
|
||||||
|
expect(r.establishment).toBeNull();
|
||||||
|
expect(r.amount).toBeNull();
|
||||||
|
expect(r.date).toBeNull();
|
||||||
|
expect(r.confidence).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('prefers grand total over subtotal even when subtotal is named "total"', () => {
|
||||||
|
const text = `
|
||||||
|
Vendor X
|
||||||
|
|
||||||
|
Item A 10.00
|
||||||
|
Item B 20.00
|
||||||
|
Subtotal 30.00
|
||||||
|
Tax 3.00
|
||||||
|
Grand Total 33.00
|
||||||
|
`;
|
||||||
|
const r = parseReceiptText({ text });
|
||||||
|
expect(r.amount).toBe(33);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('skips obvious total/subtotal lines when extracting line items', () => {
|
||||||
|
const text = `
|
||||||
|
Boutique
|
||||||
|
Shirt 25.00
|
||||||
|
Tie 15.00
|
||||||
|
Subtotal 40.00
|
||||||
|
Tax 4.00
|
||||||
|
Total 44.00
|
||||||
|
`;
|
||||||
|
const r = parseReceiptText({ text });
|
||||||
|
const descriptions = r.lineItems.map((li) => li.description.toLowerCase());
|
||||||
|
expect(descriptions.some((d) => d.includes('subtotal'))).toBe(false);
|
||||||
|
expect(descriptions.some((d) => d.includes('total'))).toBe(false);
|
||||||
|
expect(descriptions.some((d) => d.includes('tax'))).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('rejects implausible dates', () => {
|
||||||
|
const r = parseReceiptText({ text: 'Random 1899-04-12 noise' });
|
||||||
|
expect(r.date).toBeNull();
|
||||||
|
});
|
||||||
|
});
|
||||||
Reference in New Issue
Block a user