chore(autonomous-session): consolidate uncommitted work from prior session
Bundles the prior autonomous-session output that was sitting unstaged:

- Em-dash sweep across src/ + tests/ (en-dash/em-dash to hyphen, ~2280 instances)
- country-flag-icons rollout (CountryFlag component, replaces emoji glyphs that never rendered on Windows; lazy-loads the 3x2 SVG index as a single chunk after the per-subpath dynamic-import approach silently failed in webpack)
- Admin IA Phase 1+2: 7-domain regroup, 41 to 38 pages, /admin/berths index, redirects (ocr to ai, reports to dashboard, invitations to users), docs/admin-ia-proposal.md
- Per-template email tester (registry + endpoint + UI on Email admin page)
- Cancel-document mode picker (delete-from-Documenso vs keep-for-audit)
- Dashboard PDF report: 25 widgets, SVG charts, date-range picker, 11 resolvers
- Customize-widgets per-region sortables at xl+ (charts/rails/feed); single flat sortable below xl when the layout stacks; per-viewport saved orders
- Audit doc updates capturing each shipped item
- Lint fixes: react-compiler immutability in DonutChart (reduce instead of let-reassign), set-state-in-effect disables in CountryFlag and UploadForSigning preview-bytes effect, unused 'confirm' destructures in interest contract + reservation tabs, unescaped apostrophe in test-template card copy
This commit is contained in:
@@ -1,19 +1,19 @@
|
||||
/**
|
||||
* Reverse parser for per-berth PDFs (Phase 6b — see plan §4.7b and §9.2).
|
||||
* Reverse parser for per-berth PDFs (Phase 6b - see plan §4.7b and §9.2).
|
||||
*
|
||||
* Three tiers, each falling back to the next:
|
||||
*
|
||||
* 1. AcroForm — read named text fields via pdf-lib. The sample
|
||||
* 1. AcroForm - read named text fields via pdf-lib. The sample
|
||||
* `Berth_Spec_Sheet_A1.pdf` has 0 AcroForm fields (designers export the
|
||||
* PDF flat), so this tier is built defensively for future templates that
|
||||
* may include named form fields. When fields exist, this is the highest-
|
||||
* confidence path because there's no OCR loss.
|
||||
*
|
||||
* 2. OCR — Tesseract.js extracts text from the page; positional/regex
|
||||
* 2. OCR - Tesseract.js extracts text from the page; positional/regex
|
||||
* heuristics keyed off the labels documented in §9.2 pull out values.
|
||||
* Returns per-field confidence scores.
|
||||
*
|
||||
* 3. AI fallback — gated on `getResolvedOcrConfig(...)` returning a usable
|
||||
* 3. AI fallback - gated on `getResolvedOcrConfig(...)` returning a usable
|
||||
* OpenAI/Claude config. Only invoked when OCR confidence is below
|
||||
* threshold for too many fields AND the rep opts in via the diff dialog.
|
||||
* A null `apiKey` causes this tier to return a clear "not configured"
|
||||
@@ -41,7 +41,7 @@ export interface ExtractedBerthFields {
|
||||
/** Water depth at the berth (separate from a vessel's max draft). */
|
||||
waterDepth?: number | null;
|
||||
waterDepthM?: number | null;
|
||||
/** Max draught of vessel — falls back to the berth's draft column. */
|
||||
/** Max draught of vessel - falls back to the berth's draft column. */
|
||||
draftFt?: number | null;
|
||||
draftM?: number | null;
|
||||
bowFacing?: string | null;
|
||||
@@ -73,11 +73,11 @@ export interface ParsedField<T = unknown> {
|
||||
|
||||
export interface ParseResult {
|
||||
engine: ParserEngine;
|
||||
/** Sparse — only fields the parser was able to extract. */
|
||||
/** Sparse - only fields the parser was able to extract. */
|
||||
fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
|
||||
/** Mean confidence across all extracted fields (0..1). */
|
||||
meanConfidence: number;
|
||||
/** Raw text the OCR or AI tier produced — useful for the diff dialog audit. */
|
||||
/** Raw text the OCR or AI tier produced - useful for the diff dialog audit. */
|
||||
rawText?: string;
|
||||
/** Set when a tier degraded; the API surface uses this to decide whether to
|
||||
* surface the "AI parse" button. */
|
||||
@@ -155,7 +155,7 @@ async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
|
||||
const name = field.getName().toLowerCase();
|
||||
const target = ACROFORM_FIELD_MAP[name];
|
||||
if (!target) continue;
|
||||
// pdf-lib doesn't expose a generic "get value" — narrow to text fields.
|
||||
// pdf-lib doesn't expose a generic "get value" - narrow to text fields.
|
||||
let raw: string | undefined;
|
||||
try {
|
||||
const tf = form.getTextField(field.getName());
|
||||
@@ -182,12 +182,12 @@ async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
|
||||
|
||||
/**
|
||||
* Tier-2 extracts text directly from the PDF via `unpdf` (a serverless-
|
||||
* friendly pdfjs wrapper). This works for text-PDFs — i.e. PDFs that
|
||||
* friendly pdfjs wrapper). This works for text-PDFs - i.e. PDFs that
|
||||
* contain real text streams, not scanned page images. Scanned/raster PDFs
|
||||
* land here with empty extracted text and fall through to the AI tier.
|
||||
*
|
||||
* The earlier design called for tesseract.js rasterization, but
|
||||
* `tesseract.recognize` doesn't accept a PDF buffer — it expects an image.
|
||||
* `tesseract.recognize` doesn't accept a PDF buffer - it expects an image.
|
||||
* That old code path silently failed at runtime; unpdf is the correct
|
||||
* primitive for "pull text out of a PDF on the server."
|
||||
*
|
||||
@@ -263,7 +263,7 @@ export function extractFromOcrText(rawText: string): {
|
||||
out.mooringNumber = { value: mooringMatch[1]!.toUpperCase(), confidence: 0.85, engine: 'ocr' };
|
||||
}
|
||||
|
||||
// Length / Width / Water Depth — `Label: <imperial> / <metric>` form.
|
||||
// Length / Width / Water Depth - `Label: <imperial> / <metric>` form.
|
||||
// Imperial may be `206' 8"` style; we capture the numeric prefix in feet
|
||||
// and parse the metric independently because they're rarely lossless.
|
||||
const dimensional = (
|
||||
@@ -287,7 +287,7 @@ export function extractFromOcrText(rawText: string): {
|
||||
}
|
||||
if (ft != null && Number.isFinite(meters) && Math.abs(ft * 0.3048 - meters) / meters > 0.01) {
|
||||
warnings.push(
|
||||
`${label}: imperial/metric mismatch — ${ft}ft vs ${meters}m differ >1% (using imperial as source of truth).`,
|
||||
`${label}: imperial/metric mismatch - ${ft}ft vs ${meters}m differ >1% (using imperial as source of truth).`,
|
||||
);
|
||||
}
|
||||
};
|
||||
@@ -394,7 +394,7 @@ async function tryOcr(buffer: Buffer, adapter?: OcrAdapter): Promise<ParseResult
|
||||
};
|
||||
}
|
||||
const { fields, warnings } = extractFromOcrText(result.text);
|
||||
// Tesseract gives 0..100; normalize to 0..1 and use it as a global floor —
|
||||
// Tesseract gives 0..100; normalize to 0..1 and use it as a global floor -
|
||||
// per-field confidence is set by the regex tier above.
|
||||
const floor = Math.max(0, Math.min(result.confidence, 100)) / 100;
|
||||
for (const key of Object.keys(fields) as Array<keyof ExtractedBerthFields>) {
|
||||
@@ -441,7 +441,7 @@ export interface ParseBerthPdfOptions {
|
||||
* returned result's `engine` field tells callers which tier produced the
|
||||
* fields (used by the reconcile-diff dialog to colour confidence chips).
|
||||
*
|
||||
* The AI tier is never invoked from this entry point — that's a separate
|
||||
* The AI tier is never invoked from this entry point - that's a separate
|
||||
* deliberate action triggered from the diff dialog so OPENAI_API_KEY isn't
|
||||
* spent on every upload.
|
||||
*/
|
||||
@@ -499,7 +499,7 @@ function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string
|
||||
return raw;
|
||||
}
|
||||
// Numeric columns: strip currency / unit suffixes and commas. Berth
|
||||
// dimensions / capacities / prices are all non-negative — reject
|
||||
// dimensions / capacities / prices are all non-negative - reject
|
||||
// negatives outright so an AcroForm with `length_ft="-50"` doesn't
|
||||
// poison the recommender feasibility filter when applied.
|
||||
const numeric = Number(raw.replace(/[^0-9.\-]/g, ''));
|
||||
|
||||
Reference in New Issue
Block a user