feat(migration): verification/audit script (PDF↔person + completeness)

Read-only audit of migrated data:
- EOI PDF ↔ person: extracts each attached signed-EOI PDF text (unpdf), confirms
  the linked client name appears, flags any PDF where a different client name
  appears. Result: 35/35 strong match, 0 mismatches (visually spot-checked 2).
- Berth PDF ↔ mooring: soft text check; moorings render as graphics so the
  filename→mooring attachment is authoritative (113/113; A1 visually confirmed).
- Per-person completeness: 0 deals missing stage, 0 clients without a deal,
  29 clients without contact info (inherited legacy data gaps).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-01 20:03:58 +02:00
parent 14ab8a8161
commit 3e47793ebe

View File

@@ -0,0 +1,210 @@
/**
* Migration verification / audit (read-only against the local dev DB + storage).
*
* 1. EOI PDF ↔ person: opens each attached signed-EOI PDF, extracts its text,
* and confirms the linked client's name actually appears inside — catching
* any wrong attachment from the name/fuzzy matcher. Flags any PDF where a
* *different* client's name appears instead.
* 2. Berth PDF ↔ mooring: confirms each berth's spec-sheet PDF mentions its
* mooring number.
* 3. Per-person completeness: clients missing contact info, deals missing a
* stage, clients with no deal, + a sample full dump to eyeball.
*
* pnpm tsx scripts/migration/verify-migration.ts [--port-slug port-nimara]
*/
import 'dotenv/config';
import { readFile } from 'node:fs/promises';
import path from 'node:path';
import { extractText, getDocumentProxy } from 'unpdf';
import { and, eq, isNotNull, sql } from 'drizzle-orm';
import { db, closeDb } from '@/lib/db';
import { ports } from '@/lib/db/schema/ports';
import { documents, files } from '@/lib/db/schema/documents';
import { clients } from '@/lib/db/schema/clients';
import { berths, berthPdfVersions } from '@/lib/db/schema/berths';
/** Directory that stored file paths are resolved against; `storage` when the env var is unset or empty. */
const STORAGE_ROOT = process.env.STORAGE_ROOT ? process.env.STORAGE_ROOT : 'storage';

/** Port slug read from `--port-slug <slug>` on the command line; `port-nimara` when absent. */
const slugArg = ((): string => {
  const fallback = 'port-nimara';
  const flagAt = process.argv.indexOf('--port-slug');
  if (flagAt < 0) return fallback;
  // Flag given as the very last argument (no value after it) also falls back.
  return process.argv[flagAt + 1] ?? fallback;
})();
/**
 * Canonicalises text for loose name matching.
 *
 * Lowercases, folds accented letters to their base letter, replaces every
 * remaining character that is not `a`–`z` or a space (digits and punctuation
 * included) with a space, collapses whitespace runs and trims.
 *
 * @param s Raw text (a client name or extracted PDF text).
 * @returns Space-separated lowercase ASCII words; `''` for empty input.
 */
const norm = (s: string): string =>
  s
    .toLowerCase()
    .normalize('NFKD')
    // NFKD turns "ü" into "u" + U+0308. The combining mark has to be removed
    // outright; if it were left for the next step it would become a space and
    // split "müller" into "mu ller".
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[^a-z ]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
/**
 * Reads the PDF at `storagePath` (resolved under `STORAGE_ROOT`), extracts its
 * text with unpdf and returns that text passed through `norm`.
 *
 * @param storagePath Path of the PDF relative to `STORAGE_ROOT`.
 * @returns The normalised text of the whole document.
 */
async function pdfText(storagePath: string): Promise<string> {
  const absolute = path.join(STORAGE_ROOT, storagePath);
  const bytes = new Uint8Array(await readFile(absolute));
  const doc = await getDocumentProxy(bytes);
  const { text } = await extractText(doc, { mergePages: true });
  // Tolerate a per-page array as well as a single merged string.
  if (Array.isArray(text)) return norm(text.join(' '));
  return norm(text);
}
/** Message of a thrown value, tolerating throws that are not `Error` instances. */
function auditErrorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Extracts a berth spec-sheet's text for the mooring check: lowercased with
 * whitespace collapsed, but with digits and punctuation kept.
 *
 * `pdfText` cannot be used here: it runs the text through `norm`, which
 * replaces every non a–z character — digits included — with a space, so a
 * mooring such as "a1" could never be found in its output.
 */
async function berthPdfText(storageKey: string): Promise<string> {
  const buf = await readFile(path.join(STORAGE_ROOT, storageKey));
  const pdf = await getDocumentProxy(new Uint8Array(buf));
  const res = await extractText(pdf, { mergePages: true });
  const raw = Array.isArray(res.text) ? res.text.join(' ') : res.text;
  return raw.toLowerCase().replace(/\s+/g, ' ').trim();
}

/**
 * Loose test for a mooring label inside lowercased PDF text.
 *
 * For "letters + digits" labels ("A1", "D32") optional whitespace is allowed
 * between the two parts, and the number must not be followed by another digit
 * so that "a1" does not match inside "a10". Any other label shape falls back
 * to a plain substring test.
 */
function mooringAppearsIn(mooring: string, text: string): boolean {
  const moo = mooring.trim().toLowerCase();
  if (!moo) return false;
  const m = moo.match(/^([a-z]+)(\d+)$/);
  if (!m) return text.includes(moo);
  const [, letters, digits] = m;
  return new RegExp(`${letters}\\s*${digits}(?!\\d)`).test(text);
}

/**
 * Runs the three read-only audits for the port selected by `slugArg` and
 * prints the findings to stdout:
 *
 * 1. signed-EOI PDF ↔ client name,
 * 2. berth spec-sheet PDF ↔ mooring number,
 * 3. completeness counts for migrated clients/deals plus a small sample dump.
 *
 * Closes the DB and exits the process with code 0 when done.
 *
 * @throws Error when no port row matches `slugArg`; DB errors propagate.
 */
async function main() {
  const [port] = await db
    .select({ id: ports.id, slug: ports.slug })
    .from(ports)
    .where(eq(ports.slug, slugArg))
    .limit(1);
  if (!port) throw new Error(`no port ${slugArg}`);

  // Name tokens (>= 4 chars) of every client in the port, used only to hint at
  // which other client a non-matching PDF might really belong to.
  const allNames = (
    await db
      .select({ id: clients.id, name: clients.fullName })
      .from(clients)
      .where(eq(clients.portId, port.id))
  ).map((c) => ({
    id: c.id,
    tokens: norm(c.name)
      .split(' ')
      .filter((t) => t.length >= 4),
    name: c.name,
  }));

  // ── 1. EOI PDF ↔ person ──────────────────────────────────────────────────
  const eoiRows = await db
    .select({
      docId: documents.id,
      clientId: documents.clientId,
      fullName: clients.fullName,
      storagePath: files.storagePath,
    })
    .from(documents)
    .innerJoin(files, eq(files.id, documents.signedFileId))
    .innerJoin(clients, eq(clients.id, documents.clientId))
    .where(
      and(
        eq(documents.portId, port.id),
        eq(documents.documentType, 'eoi'),
        isNotNull(documents.signedFileId),
      ),
    );
  console.log(`\n═══ 1. EOI PDF ↔ person (${eoiRows.length} attached signed EOIs) ═══`);
  let ok = 0,
    weak = 0,
    bad = 0,
    err = 0;
  for (const r of eoiRows) {
    try {
      const text = await pdfText(r.storagePath);
      // First and last name token of >= 3 chars; substring match on the PDF text.
      const tokens = norm(r.fullName)
        .split(' ')
        .filter((t) => t.length >= 3);
      const first = tokens[0];
      const last = tokens[tokens.length - 1];
      const hasFirst = !!first && text.includes(first);
      const hasLast = !!last && text.includes(last);
      if (hasFirst && hasLast) {
        ok++;
      } else if (hasFirst || hasLast) {
        weak++;
        console.log(
          `  ⚠ WEAK  "${r.fullName}" — only ${hasLast ? 'surname' : 'first name'} found in its PDF`,
        );
      } else {
        bad++;
        // Hint only: the first other client with any name token in the text.
        const other = allNames.find(
          (c) => c.id !== r.clientId && c.tokens.some((t) => text.includes(t)),
        );
        console.log(
          `  ✗ BAD   "${r.fullName}" — name NOT in its PDF${other ? ` — but "${other.name}" DOES appear (likely mis-attached!)` : ''}`,
        );
      }
    } catch (e) {
      err++;
      console.log(`  ! ERR   "${r.fullName}": ${auditErrorMessage(e)}`);
    }
  }
  console.log(`  → strong ${ok} · weak ${weak} · NO-match ${bad} · read-error ${err}`);

  // ── 2. Berth PDF ↔ mooring ───────────────────────────────────────────────
  const berthRows = await db
    .select({ mooring: berths.mooringNumber, storageKey: berthPdfVersions.storageKey })
    .from(berths)
    .innerJoin(berthPdfVersions, eq(berthPdfVersions.id, berths.currentPdfVersionId))
    .where(eq(berths.portId, port.id));
  console.log(`\n═══ 2. Berth PDF ↔ mooring (${berthRows.length} berths with a PDF) ═══`);
  let bOk = 0,
    bBad = 0,
    bErr = 0;
  for (const r of berthRows) {
    try {
      // Digit-preserving text: the name-oriented `pdfText` would erase the number.
      const text = await berthPdfText(r.storageKey);
      if (mooringAppearsIn(r.mooring, text)) bOk++;
      else {
        bBad++;
        console.log(`  ✗ "${r.mooring}" mooring not found in its spec sheet`);
      }
    } catch (e) {
      bErr++;
      console.log(`  ! ERR ${r.mooring}: ${auditErrorMessage(e)}`);
    }
  }
  console.log(`  → mooring-in-PDF ${bOk} · not-found ${bBad} · read-error ${bErr}`);

  // ── 3. Per-person completeness ───────────────────────────────────────────
  // These queries select everything linked through migration_source_links;
  // unlike sections 1 and 2 they are not restricted to the selected port.
  console.log(`\n═══ 3. Per-person data completeness (migrated clients) ═══`);
  const noContact = await db.execute(sql`
    select c.full_name from clients c
    join migration_source_links l on l.target_entity_id=c.id and l.target_entity_type='client'
    where not exists (select 1 from client_contacts cc where cc.client_id=c.id)`);
  console.log(`  clients with NO contact (email/phone): ${noContact.length}`);
  for (const r of noContact.slice(0, 15))
    console.log(`     - ${(r as { full_name: string }).full_name}`);
  const noDeal = await db.execute(sql`
    select c.full_name from clients c
    join migration_source_links l on l.target_entity_id=c.id and l.target_entity_type='client'
    where not exists (select 1 from interests i where i.client_id=c.id)`);
  console.log(`  migrated clients with NO deal: ${noDeal.length}`);
  const noStage = await db.execute(sql`
    select count(*) n from interests i
    join migration_source_links l on l.target_entity_id=i.id and l.target_entity_type='interest'
    where i.pipeline_stage is null`);
  const noStageCount = (noStage[0] as { n: number | string } | undefined)?.n ?? 0;
  console.log(`  migrated deals with NULL stage: ${noStageCount}`);

  // sample full dump to eyeball
  console.log(`\n  -- sample of 6 migrated clients (eyeball) --`);
  const sample = await db.execute(sql`
    select c.full_name,
      (select string_agg(cc.channel||':'||cc.value, ', ') from client_contacts cc where cc.client_id=c.id) contacts,
      (select count(*) from interests i where i.client_id=c.id) deals,
      (select string_agg(distinct i.pipeline_stage, ',') from interests i where i.client_id=c.id) stages
    from clients c
    join migration_source_links l on l.target_entity_id=c.id and l.target_entity_type='client'
    order by deals desc nulls last limit 6`);
  for (const r of sample as unknown as Array<{
    full_name: string;
    contacts: string;
    deals: number;
    stages: string;
  }>) {
    console.log(
      `  ${r.full_name} · ${r.deals} deal(s) [${r.stages}] · ${r.contacts ?? '(no contacts)'}`,
    );
  }
  await closeDb();
  process.exit(0);
}
// Script entry point: run the audit; on any failure log it, make a best-effort
// attempt to close the DB, and exit with a non-zero status.
void (async () => {
  try {
    await main();
  } catch (failure) {
    console.error('verify failed:', failure);
    try {
      await closeDb();
    } catch {
      // Closing is best-effort; the original failure is what matters.
    }
    process.exit(1);
  }
})();