feat(migration): document backfill — legacy MinIO → CRM storage (Phase 2)

backfill-documents.ts pulls signed EOI PDFs + berth spec PDFs from the legacy
MinIO (client-portal bucket; read-only via dedicated LEGACY_MINIO_* creds) and
deposits them into the CRM (getStorageBackend), linking:
- berth PDFs → berth_pdf_versions + berths.current_pdf_version_id (mooring from
  filename; 113/113 matched)
- signed EOIs → documents.signed_file_id + status=completed + a files row filed
  into the client folder (exact name + conservative lev<=2 fuzzy; 33 linked)
Idempotent (skips when signedFileId / current_pdf_version_id already set).
Strictly prod-READ-only; all writes local (dev storage_backend=filesystem).
Unmatched EOIs reported (mostly in-flight deals w/ no signed PDF yet + old-LOI
docs in the NocoDB attachment bucket).

Adds probe-minio.ts (read-only bucket inventory).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-01 19:33:15 +02:00
parent 6c040a617b
commit 14ab8a8161
2 changed files with 416 additions and 0 deletions

View File

@@ -0,0 +1,314 @@
/**
* Phase 2 of the legacy migration: pull signed EOI PDFs + berth spec PDFs from
* the LEGACY MinIO (`client-portal` bucket) and deposit them into the CRM's own
* storage, linking them to the already-migrated deals + berths.
*
* Two storage worlds, kept strictly separate:
* - LEGACY read : a dedicated `minio` Client using LEGACY_MINIO_* env.
* - CRM write : `getStorageBackend()` (the CRM's own configured storage).
* ⚠ We NEVER route legacy creds through getStorageBackend — that would
* write INTO prod. LEGACY_MINIO_* is distinct from the CRM's MINIO_*.
*
* Idempotent + re-runnable: an EOI is skipped once its `documents.signedFileId`
* is set; a berth is skipped once it has a `currentPdfVersionId`.
*
* Run AFTER `migrate-from-nocodb.ts --apply`:
* LEGACY_MINIO_ACCESS_KEY=… LEGACY_MINIO_SECRET_KEY=… \
* pnpm tsx scripts/migration/backfill-documents.ts --port-slug port-nimara [--dry-run]
*/
import 'dotenv/config';
import { randomUUID } from 'node:crypto';
import { Client as MinioClient } from 'minio';
import { and, eq, isNull } from 'drizzle-orm';
import { db, closeDb } from '@/lib/db';
import { ports } from '@/lib/db/schema/ports';
import { berths } from '@/lib/db/schema/berths';
import { documents, files } from '@/lib/db/schema/documents';
import { clients } from '@/lib/db/schema/clients';
import { getStorageBackend } from '@/lib/storage';
import { buildStoragePath } from '@/lib/minio';
import { ensureEntityFolder } from '@/lib/services/document-folders.service';
import { uploadBerthPdf } from '@/lib/services/berth-pdf.service';
import { normalizeName } from '@/lib/dedup/normalize';
import { SUPER_ADMIN_USER_ID } from '@/lib/db/seed-bootstrap';
const DRY = process.argv.includes('--dry-run');
const slugArg = (() => {
const i = process.argv.indexOf('--port-slug');
return i >= 0 ? (process.argv[i + 1] ?? 'port-nimara') : 'port-nimara';
})();
const LEGACY_BUCKET = process.env.LEGACY_MINIO_BUCKET ?? 'client-portal';
const legacy = new MinioClient({
endPoint: process.env.LEGACY_MINIO_ENDPOINT ?? 's3.portnimara.com',
port: 443,
useSSL: true,
accessKey: process.env.LEGACY_MINIO_ACCESS_KEY ?? '',
secretKey: process.env.LEGACY_MINIO_SECRET_KEY ?? '',
});
/** Levenshtein edit distance — conservative fuzzy name matching for legacy
* spelling/format drift (Koshbin↔Khoshbin, Costanzo↔Constanzo). */
function lev(a: string, b: string): number {
const m = a.length;
const n = b.length;
if (!m) return n;
if (!n) return m;
let prev = Array.from({ length: n + 1 }, (_, i) => i);
for (let i = 1; i <= m; i++) {
const cur = [i];
for (let j = 1; j <= n; j++) {
cur[j] = Math.min(
prev[j]! + 1,
cur[j - 1]! + 1,
prev[j - 1]! + (a[i - 1] === b[j - 1] ? 0 : 1),
);
}
prev = cur;
}
return prev[n]!;
}
function streamToBuffer(stream: NodeJS.ReadableStream): Promise<Buffer> {
return new Promise((resolve, reject) => {
const chunks: Buffer[] = [];
stream.on('data', (c: Buffer) => chunks.push(c));
stream.on('end', () => resolve(Buffer.concat(chunks)));
stream.on('error', reject);
});
}
interface LegacyObject {
name: string;
size: number;
}
function listLegacy(prefix: string): Promise<LegacyObject[]> {
return new Promise((resolve, reject) => {
const out: LegacyObject[] = [];
const stream = legacy.listObjectsV2(LEGACY_BUCKET, prefix, true);
stream.on('data', (o) => {
if (o.name && !o.name.endsWith('/')) out.push({ name: o.name, size: o.size ?? 0 });
});
stream.on('end', () => resolve(out));
stream.on('error', reject);
});
}
async function resolvePort(slug: string): Promise<{ id: string; slug: string }> {
const [p] = await db
.select({ id: ports.id, slug: ports.slug })
.from(ports)
.where(eq(ports.slug, slug))
.limit(1);
if (!p) throw new Error(`No port with slug "${slug}"`);
return p;
}
// ─── Berth PDFs ──────────────────────────────────────────────────────────────
// client-portal/Berth-PDFs/<ts>-Berth_Spec_Sheet_<Mooring>.pdf → berth by mooring.
async function backfillBerthPdfs(port: { id: string; slug: string }) {
const objs = (await listLegacy('Berth-PDFs/')).filter((o) => /\.pdf$/i.test(o.name));
const berthRows = await db
.select({ id: berths.id, mooring: berths.mooringNumber, cur: berths.currentPdfVersionId })
.from(berths)
.where(eq(berths.portId, port.id));
const byMooring = new Map(berthRows.map((b) => [b.mooring, b]));
let attached = 0;
let skipped = 0;
let unmatched = 0;
for (const o of objs) {
const m = o.name.match(/Berth_Spec_Sheet_([A-Za-z]+\d+)\.pdf$/i);
if (!m) {
unmatched++;
continue;
}
const mooring = `${m[1]!.replace(/[a-z]+/g, (s) => s.toUpperCase())}`
.toUpperCase()
.replace(/([A-Z]+)0*(\d+)/, '$1$2');
const berth = byMooring.get(mooring);
if (!berth) {
console.log(` [berth] no berth for mooring "${mooring}" (${o.name})`);
unmatched++;
continue;
}
if (berth.cur) {
skipped++;
continue;
}
if (DRY) {
attached++;
continue;
}
const buf = await streamToBuffer(await legacy.getObject(LEGACY_BUCKET, o.name));
await uploadBerthPdf({
berthId: berth.id,
portId: port.id,
buffer: buf,
fileName: o.name.split('/').pop() ?? `${mooring}.pdf`,
uploadedBy: SUPER_ADMIN_USER_ID,
});
attached++;
}
return { total: objs.length, attached, skipped, unmatched };
}
// ─── Signed EOIs ─────────────────────────────────────────────────────────────
// client-portal/EOIs/<Client Name>/<file>.pdf → match by normalized client name.
async function backfillEois(port: { id: string; slug: string }) {
// Signed EOIs live under EOIs/<Name>/ and (some) under Client Documents/<Name>/.
const objs = [...(await listLegacy('EOIs/')), ...(await listLegacy('Client Documents/'))].filter(
(o) => /\.pdf$/i.test(o.name) && /eoi|sign/i.test(o.name),
);
// Index the best signed PDF per normalized folder (client) name.
const byName = new Map<string, { key: string; size: number }>();
for (const o of objs) {
const parts = o.name.split('/'); // <prefix> / <Name> / <file>.pdf
if (parts.length < 3) continue;
const folder = (parts[1] ?? '').replace(/_/g, ' '); // "Matt_Ciaccio" → "Matt Ciaccio"
const norm = normalizeName(folder).display;
if (!norm) continue;
const isSigned = /sign/i.test(o.name);
const prev = byName.get(norm);
// Prefer a "signed" file; among those, the largest (the full signed PDF).
if (!prev || (isSigned && o.size > prev.size)) byName.set(norm, { key: o.name, size: o.size });
}
// Migrated EOI documents missing a signed file.
const docRows = await db
.select({ id: documents.id, interestId: documents.interestId, clientId: documents.clientId })
.from(documents)
.where(
and(
eq(documents.portId, port.id),
eq(documents.documentType, 'eoi'),
isNull(documents.signedFileId),
),
);
const backend = await getStorageBackend();
let attached = 0;
let unmatched = 0;
const unresolved: string[] = [];
for (const doc of docRows) {
const clientId = doc.clientId;
if (!clientId) {
unmatched++;
continue;
}
const [c] = await db
.select({ name: clients.fullName })
.from(clients)
.where(eq(clients.id, clientId))
.limit(1);
if (!c) {
unmatched++;
continue;
}
const target = normalizeName(c.name).display;
let match = byName.get(target);
if (!match && target.length >= 6) {
// Conservative fuzzy fallback: best edit-distance ≤ 2 on the full name.
let bestDist = 3;
for (const [name, v] of byName) {
const d = lev(name, target);
if (d < bestDist) {
bestDist = d;
match = v;
}
}
}
if (!match) {
unresolved.push(c.name);
unmatched++;
continue;
}
if (DRY) {
attached++;
continue;
}
// Pull legacy bytes → write to CRM storage → files row → link signedFileId.
const buf = await streamToBuffer(await legacy.getObject(LEGACY_BUCKET, match.key));
const key = buildStoragePath(port.slug, 'eoi-signed', doc.id, randomUUID(), 'pdf');
const putRes = await backend.put(key, buf, {
contentType: 'application/pdf',
sizeBytes: buf.length,
});
// File into the client's entity folder (mirrors handleDocumentCompleted's
// owner-folder filing). files.interestId still scopes the row to the deal;
// interest "Deal" folders aren't system-managed (chk_system_folder_shape).
const folder = await ensureEntityFolder(port.id, 'client', clientId, SUPER_ADMIN_USER_ID);
const fileName = match.key.split('/').pop() ?? 'eoi-signed.pdf';
await db.transaction(async (tx) => {
const [f] = await tx
.insert(files)
.values({
portId: port.id,
filename: fileName,
originalName: fileName,
storagePath: putRes.key,
mimeType: 'application/pdf',
sizeBytes: String(putRes.sizeBytes),
category: 'eoi',
folderId: folder.id,
clientId,
interestId: doc.interestId,
uploadedBy: 'system',
})
.returning({ id: files.id });
if (!f) throw new Error('files insert returned no row');
await tx
.update(documents)
.set({ signedFileId: f.id, status: 'completed', isManualUpload: true })
.where(eq(documents.id, doc.id));
});
attached++;
}
return {
totalBlobs: objs.length,
indexedClients: byName.size,
candidates: docRows.length,
attached,
unmatched,
unresolved,
};
}
async function main() {
if (!process.env.LEGACY_MINIO_ACCESS_KEY || !process.env.LEGACY_MINIO_SECRET_KEY) {
console.error(
'Set LEGACY_MINIO_ACCESS_KEY + LEGACY_MINIO_SECRET_KEY (legacy MinIO read creds).',
);
process.exit(1);
}
const port = await resolvePort(slugArg);
console.log(
`[backfill] port=${port.slug} legacy-bucket=${LEGACY_BUCKET} ${DRY ? '(DRY RUN)' : ''}`,
);
console.log('[backfill] Berth PDFs…');
const berthRes = await backfillBerthPdfs(port);
console.log(
` berth PDFs: ${berthRes.total} blobs → ${berthRes.attached} attached, ${berthRes.skipped} already had one, ${berthRes.unmatched} unmatched`,
);
console.log('[backfill] Signed EOIs…');
const eoiRes = await backfillEois(port);
console.log(
` EOIs: ${eoiRes.totalBlobs} blobs (${eoiRes.indexedClients} client folders) · ${eoiRes.candidates} migrated EOI docs needing a file → ${eoiRes.attached} attached, ${eoiRes.unmatched} unmatched`,
);
if (eoiRes.unresolved.length > 0) {
console.log(` ⚠ EOI docs with no name-matched legacy PDF (${eoiRes.unresolved.length}):`);
for (const n of eoiRes.unresolved.slice(0, 25)) console.log(` - ${n}`);
}
await closeDb();
process.exit(0);
}
main().catch(async (err) => {
console.error('[backfill] failed:', err);
await closeDb().catch(() => {});
process.exit(1);
});

View File

@@ -0,0 +1,102 @@
/**
* Read-only MinIO inventory for the legacy → new-CRM migration (Phase 2 sizing).
*
* Lists every bucket the creds can see, then for the document buckets
* (`client-portal`, `signatures`) groups objects by top-level prefix with
* counts + sizes + samples — so we can see exactly where the EOIs, berth
* PDFs, receipts and business-card images live before backfilling them.
*
* Secret-free: reads creds from env. Run with:
* MINIO_ACCESS_KEY=... MINIO_SECRET_KEY=... \
* pnpm tsx scripts/migration/probe-minio.ts
*
* Strictly read-only (listBuckets + listObjectsV2). No writes.
*/
import { Client } from 'minio';
const endPoint = process.env.MINIO_ENDPOINT || 's3.portnimara.com';
const accessKey = process.env.MINIO_ACCESS_KEY;
const secretKey = process.env.MINIO_SECRET_KEY;
if (!accessKey || !secretKey) {
console.error('Set MINIO_ACCESS_KEY and MINIO_SECRET_KEY');
process.exit(1);
}
const client = new Client({ endPoint, port: 443, useSSL: true, accessKey, secretKey });
interface PrefixStat {
count: number;
bytes: number;
samples: string[];
}
async function inventory(bucket: string) {
const byPrefix = new Map<string, PrefixStat>();
let total = 0;
let totalBytes = 0;
await new Promise<void>((resolve, reject) => {
const stream = client.listObjectsV2(bucket, '', true);
stream.on('data', (o) => {
if (!o.name) return;
total++;
totalBytes += o.size || 0;
const top = o.name.includes('/') ? o.name.split('/')[0] + '/' : '(root)';
const e = byPrefix.get(top) || { count: 0, bytes: 0, samples: [] };
e.count++;
e.bytes += o.size || 0;
if (e.samples.length < 4) e.samples.push(`${o.name} (${o.size}b)`);
byPrefix.set(top, e);
});
stream.on('end', () => resolve());
stream.on('error', reject);
});
return { bucket, total, totalBytes, byPrefix };
}
const mb = (b: number) => (b / 1e6).toFixed(1);
async function main() {
console.log(`MinIO @ ${endPoint}\n`);
let buckets: string[] = [];
try {
const list = await client.listBuckets();
buckets = list.map((b) => b.name);
console.log('=== all buckets visible to these creds ===');
for (const b of list) console.log(` ${b.name}`);
} catch (err) {
console.log(`listBuckets failed: ${(err as Error).message}`);
}
const targets = (process.env.MINIO_BUCKETS || 'client-portal,signatures')
.split(',')
.map((s) => s.trim());
for (const bucket of targets) {
if (buckets.length && !buckets.includes(bucket)) {
console.log(`\n=== bucket: ${bucket} — NOT VISIBLE to these creds ===`);
continue;
}
try {
const inv = await inventory(bucket);
console.log(
`\n=== bucket: ${inv.bucket}${inv.total} objects, ${mb(inv.totalBytes)} MB ===`,
);
const rows = [...inv.byPrefix.entries()].sort((a, z) => z[1].count - a[1].count);
for (const [prefix, e] of rows) {
console.log(
` ${prefix.padEnd(30)} ${String(e.count).padStart(5)} obj ${mb(e.bytes).padStart(8)} MB`,
);
for (const s of e.samples) console.log(` e.g. ${s}`);
}
} catch (err) {
console.log(`\n=== bucket: ${bucket} — ERROR: ${(err as Error).message} ===`);
}
}
}
main().catch((err) => {
console.error('probe-minio failed:', err);
process.exit(1);
});