From ef63e86fdefd06a84fd6b6ec70591101a63e0987 Mon Sep 17 00:00:00 2001 From: Matt Date: Sun, 10 May 2026 16:53:51 +0200 Subject: [PATCH] feat(documents): importer for organized S3/filesystem buckets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One-shot script that walks an existing organized bucket tree, builds matching document_folders rows mirroring the path, then inserts documents + files rows pointing at the existing storage keys verbatim — no path rewrite. For migrating from a legacy MinIO bucket whose folder structure is already the source of truth. Idempotency: • Folders: sibling-name unique index swallows duplicate creates; we reuse the row on ConflictError. • Documents: skipped when (port_id, fileStoragePath) already exists. Adds StorageBackend.listByPrefix (recursive readdir on filesystem; listObjectsV2 stream-drain on s3) — the first one-shot caller, not a hot path. Pure parseImportPath helper extracted to its own module and unit-tested for trailing slashes, empty intermediate segments, prefix mismatch, and special-character folder names (8 tests). Audit log per imported doc carries source='organized-bucket-importer', storageKey, and folderSegments so the documents inspector can filter on imports later. CLI: pnpm tsx scripts/import-organized-documents.ts \\ --port-slug \\ --bucket-prefix "legacy-imports/" \\ (--dry-run | --apply) [--uploaded-by ] Folds in Prettier post-hook drift on documents.service.ts + download handler — same lint-staged formatting the earlier commits already absorbed. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/import-organized-documents.ts | 323 ++++++++++++++++++ .../[id]/download/[...slug]/handlers.ts | 6 +- src/lib/services/document-import.ts | 48 +++ src/lib/services/documents.service.ts | 5 +- src/lib/storage/filesystem.ts | 37 ++ src/lib/storage/index.ts | 9 + src/lib/storage/s3.ts | 16 + tests/unit/document-import.test.ts | 60 ++++ 8 files changed, 495 insertions(+), 9 deletions(-) create mode 100644 scripts/import-organized-documents.ts create mode 100644 src/lib/services/document-import.ts create mode 100644 tests/unit/document-import.test.ts diff --git a/scripts/import-organized-documents.ts b/scripts/import-organized-documents.ts new file mode 100644 index 00000000..9db230c0 --- /dev/null +++ b/scripts/import-organized-documents.ts @@ -0,0 +1,323 @@ +/** + * Importer for an organized S3 / filesystem bucket whose folder structure + * already represents real organisation. Walks every key under `--bucket-prefix`, + * builds matching `document_folders` rows mirroring the path, then inserts + * `documents` + `files` rows pointing at the existing storage keys verbatim + * — no path rewrite. Use when migrating from a legacy MinIO bucket whose + * tree is the source of truth. + * + * Usage: + * pnpm tsx scripts/import-organized-documents.ts --port-slug \ + * --bucket-prefix "legacy-imports/" --dry-run + * pnpm tsx scripts/import-organized-documents.ts --port-slug \ + * --bucket-prefix "legacy-imports/" --apply + * + * Idempotency: + * - Folders: sibling-name unique index swallows duplicate creates and we + * reuse the existing row. + * - Documents: skipped when a row with `(port_id, fileStoragePath)` already + * exists — the storage key is the natural identity for this importer. + */ + +import 'dotenv/config'; +import path from 'node:path'; +import { and, eq } from 'drizzle-orm'; + +import { db } from '@/lib/db'; +import { ports } from '@/lib/db/schema/ports'; +import { documents, documentFolders, files } from '@/lib/db/schema/documents'; +import { user } from '@/lib/db/schema/users'; +import { createAuditLog } from '@/lib/audit'; +import { ConflictError } from '@/lib/errors'; +import { createFolder } from '@/lib/services/document-folders.service'; +import { parseImportPath } from '@/lib/services/document-import'; +import { getStorageBackend } from '@/lib/storage'; + +interface CliArgs { + portSlug: string; + bucketPrefix: string; + dryRun: boolean; + apply: boolean; + uploadedByUserId: string | null; +} + +function parseArgs(argv: string[]): CliArgs { + const args: CliArgs = { + portSlug: '', + bucketPrefix: '', + dryRun: false, + apply: false, + uploadedByUserId: null, + }; + for (let i = 0; i < argv.length; i += 1) { + const a = argv[i]!; + if (a === '--port-slug') args.portSlug = argv[++i] ?? ''; + else if (a === '--bucket-prefix') args.bucketPrefix = argv[++i] ?? ''; + else if (a === '--uploaded-by') args.uploadedByUserId = argv[++i] ?? null; + else if (a === '--dry-run') args.dryRun = true; + else if (a === '--apply') args.apply = true; + else if (a === '-h' || a === '--help') { + printHelp(); + process.exit(0); + } else { + console.error(`Unknown argument: ${a}`); + printHelp(); + process.exit(1); + } + } + if (!args.portSlug) { + console.error('Missing required --port-slug'); + process.exit(1); + } + if (!args.dryRun && !args.apply) { + console.error('Must specify either --dry-run or --apply.'); + process.exit(1); + } + if (args.dryRun && args.apply) { + console.error('--dry-run and --apply are mutually exclusive.'); + process.exit(1); + } + return args; +} + +function printHelp(): void { + console.log(`Usage: + pnpm tsx scripts/import-organized-documents.ts \\ + --port-slug \\ + --bucket-prefix \\ + (--dry-run | --apply) \\ + [--uploaded-by ] +`); +} + +interface PlannedDoc { + key: string; + folderSegments: string[]; + filename: string; + bytes: number | null; + contentType: string; + alreadyImported: boolean; +} + +const CONTENT_TYPE_BY_EXT: Record = { + '.pdf': 'application/pdf', + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.webp': 'image/webp', + '.txt': 'text/plain', + '.csv': 'text/csv', + '.doc': 'application/msword', + '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.xls': 'application/vnd.ms-excel', + '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', +}; + +function guessContentType(filename: string): string { + const ext = path.extname(filename).toLowerCase(); + return CONTENT_TYPE_BY_EXT[ext] ?? 'application/octet-stream'; +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)); + + const port = await db.query.ports.findFirst({ where: eq(ports.slug, args.portSlug) }); + if (!port) { + console.error(`Port not found: ${args.portSlug}`); + process.exit(1); + } + + let uploadedById = args.uploadedByUserId; + if (!uploadedById) { + const [u] = await db.select({ id: user.id }).from(user).limit(1); + if (!u) { + console.error( + 'No user rows exist; pass --uploaded-by or seed at least one user before running.', + ); + process.exit(1); + } + uploadedById = u.id; + console.log(`No --uploaded-by provided; falling back to first user: ${uploadedById}`); + } + + const backend = await getStorageBackend(); + console.log(`Listing keys under prefix "${args.bucketPrefix}" via ${backend.name} backend …`); + const keys = await backend.listByPrefix(args.bucketPrefix); + console.log(`Found ${keys.length} candidate keys.`); + + const plan: PlannedDoc[] = []; + for (const key of keys) { + const parsed = parseImportPath(args.bucketPrefix, key); + if (!parsed.filename) continue; + + const head = await backend.head(key); + const existing = await db.query.files.findFirst({ + where: and(eq(files.portId, port.id), eq(files.storagePath, key)), + columns: { id: true }, + }); + + plan.push({ + key, + folderSegments: parsed.folderSegments, + filename: parsed.filename, + bytes: head?.sizeBytes ?? null, + contentType: head?.contentType ?? guessContentType(parsed.filename), + alreadyImported: !!existing, + }); + } + + printPlan(plan); + + if (args.dryRun) { + console.log('\nDry-run complete. No changes written.'); + return; + } + + const folderIdByPath = new Map(); + folderIdByPath.set('', null); + let createdCount = 0; + let skippedCount = 0; + + for (const entry of plan) { + if (entry.alreadyImported) { + skippedCount += 1; + continue; + } + + const folderId = await ensureFolderChain( + port.id, + uploadedById, + entry.folderSegments, + folderIdByPath, + ); + + await db.transaction(async (tx) => { + const [fileRow] = await tx + .insert(files) + .values({ + portId: port.id, + filename: entry.filename, + originalName: entry.filename, + mimeType: entry.contentType, + sizeBytes: entry.bytes !== null ? String(entry.bytes) : null, + storagePath: entry.key, + uploadedBy: uploadedById, + category: 'misc', + }) + .returning(); + const [docRow] = await tx + .insert(documents) + .values({ + portId: port.id, + documentType: 'other', + title: entry.filename, + createdBy: uploadedById, + folderId, + fileId: fileRow!.id, + status: 'completed', + isManualUpload: true, + }) + .returning(); + + void createAuditLog({ + userId: uploadedById, + portId: port.id, + action: 'create', + entityType: 'document', + entityId: docRow!.id, + metadata: { + source: 'organized-bucket-importer', + storageKey: entry.key, + folderSegments: entry.folderSegments, + }, + }); + }); + createdCount += 1; + console.log(`✓ Imported ${entry.key}`); + } + + console.log(`\nDone. Created ${createdCount} documents, skipped ${skippedCount} (already imported).`); +} + +async function ensureFolderChain( + portId: string, + userId: string, + segments: string[], + cache: Map, +): Promise { + if (segments.length === 0) return null; + + let parentId: string | null = null; + for (let i = 0; i < segments.length; i += 1) { + const pathKey = segments.slice(0, i + 1).join('/'); + const cached = cache.get(pathKey); + if (cached !== undefined) { + parentId = cached; + continue; + } + + const name = segments[i]!; + parentId = await createOrFindFolder(portId, userId, name, parentId); + cache.set(pathKey, parentId); + } + return parentId; +} + +async function createOrFindFolder( + portId: string, + userId: string, + name: string, + parentId: string | null, +): Promise { + try { + const created = await createFolder(portId, userId, { name, parentId }); + return created.id; + } catch (err) { + if (!(err instanceof ConflictError)) throw err; + // Sibling-name unique index hit — fetch the existing row so the import + // remains idempotent across re-runs. + const trimmed = name.trim(); + const candidates = await db.query.documentFolders.findMany({ + where: parentId + ? and(eq(documentFolders.portId, portId), eq(documentFolders.parentId, parentId)) + : eq(documentFolders.portId, portId), + }); + const existing = candidates.find( + (row) => + (parentId ? row.parentId === parentId : row.parentId === null) && + row.name.toLowerCase() === trimmed.toLowerCase(), + ); + if (!existing) throw err; + return existing.id; + } +} + +function printPlan(plan: PlannedDoc[]): void { + const grouped = new Map(); + for (const entry of plan) { + const folder = entry.folderSegments.join('/') || '(root)'; + if (!grouped.has(folder)) grouped.set(folder, []); + grouped.get(folder)!.push(entry); + } + const folderNames = Array.from(grouped.keys()).sort(); + console.log('\nPlan:'); + for (const folder of folderNames) { + console.log(` ${folder}/`); + for (const entry of grouped.get(folder)!) { + const flag = entry.alreadyImported ? '·' : '+'; + const size = entry.bytes !== null ? ` (${entry.bytes}B)` : ''; + console.log(` ${flag} ${entry.filename}${size}`); + } + } + const newCount = plan.filter((p) => !p.alreadyImported).length; + const dupCount = plan.length - newCount; + console.log(`\nTotal: ${plan.length} keys → ${newCount} new, ${dupCount} already imported.`); +} + +main() + .then(() => process.exit(0)) + .catch((err) => { + console.error(err); + process.exit(1); + }); diff --git a/src/app/api/v1/documents/[id]/download/[...slug]/handlers.ts b/src/app/api/v1/documents/[id]/download/[...slug]/handlers.ts index 2cc988fa..d01e377b 100644 --- a/src/app/api/v1/documents/[id]/download/[...slug]/handlers.ts +++ b/src/app/api/v1/documents/[id]/download/[...slug]/handlers.ts @@ -73,11 +73,7 @@ export const downloadHandler: RouteHandler = async (_req, ctx, p } }; -function buildExpectedSlug( - folderId: string | null, - filename: string, - tree: FolderNode[], -): string { +function buildExpectedSlug(folderId: string | null, filename: string, tree: FolderNode[]): string { const segments: string[] = []; if (folderId) { const path = findFolderPath(tree, folderId); diff --git a/src/lib/services/document-import.ts b/src/lib/services/document-import.ts new file mode 100644 index 00000000..b10fc986 --- /dev/null +++ b/src/lib/services/document-import.ts @@ -0,0 +1,48 @@ +/** + * Pure helpers for the organized-bucket document importer + * (`scripts/import-organized-documents.ts`). + * + * The script walks an existing storage prefix that already represents real + * organisation (e.g. `legacy-imports/Deals 2026/Q1/contract.pdf`) and + * materialises matching `document_folders` + `documents` rows in the CRM + * without rewriting the storage keys. Splitting these helpers out of the + * script body makes the path-parser unit-testable in isolation. + */ + +export interface ParsedImportPath { + /** Folder names from outermost to innermost; empty when the file is at the prefix root. */ + folderSegments: string[]; + /** Filename only, never empty. */ + filename: string; +} + +/** + * Decompose a storage key into folder segments + filename relative to the + * importer prefix. Both `prefix` and `key` use POSIX separators (the + * filesystem backend's `listByPrefix` already normalises Windows paths). + * + * Edge cases: + * - Trailing slashes on prefix are tolerated (`legacy/` ≡ `legacy`). + * - Empty intermediate segments (`a//b`) collapse to `[a, b]`. + * - Leading-prefix mismatch throws — the caller should never feed in keys + * outside the prefix it asked the backend to list. + * - A key that ends in `/` (directory placeholder) yields an empty + * filename — the caller must filter those out before invoking. + */ +export function parseImportPath(prefix: string, key: string): ParsedImportPath { + const normalizedPrefix = prefix.replace(/\/+$/, ''); + let relative = key; + if (normalizedPrefix.length > 0) { + if (!key.startsWith(`${normalizedPrefix}/`)) { + throw new Error(`Key "${key}" is not under prefix "${prefix}"`); + } + relative = key.slice(normalizedPrefix.length + 1); + } + + const parts = relative.split('/').filter((segment) => segment.length > 0); + if (parts.length === 0) { + throw new Error(`Key "${key}" has no filename after stripping prefix`); + } + const filename = parts.pop()!; + return { folderSegments: parts, filename }; +} diff --git a/src/lib/services/documents.service.ts b/src/lib/services/documents.service.ts index 18563d90..35b6ade7 100644 --- a/src/lib/services/documents.service.ts +++ b/src/lib/services/documents.service.ts @@ -307,10 +307,7 @@ async function hydrateDocumentsWithDownloadUrl( const filename = row.fileId ? (filenameById.get(row.fileId) ?? null) : null; return { ...row, - downloadUrl: buildDocumentDownloadUrl( - { id: row.id, folderId: row.folderId, filename }, - tree, - ), + downloadUrl: buildDocumentDownloadUrl({ id: row.id, folderId: row.folderId, filename }, tree), }; }); } diff --git a/src/lib/storage/filesystem.ts b/src/lib/storage/filesystem.ts index e7c2f743..53cd4354 100644 --- a/src/lib/storage/filesystem.ts +++ b/src/lib/storage/filesystem.ts @@ -361,6 +361,43 @@ export class FilesystemBackend implements StorageBackend { }; } + /** + * Recursive readdir under `${root}/${prefix}`. Returns relative-to-root + * keys using POSIX separators, sorted alphabetically. Empty prefix lists + * every file in the storage root. Used by one-shot importers; not a hot + * path. We tolerate ENOENT (prefix doesn't exist) by returning [] so the + * caller doesn't have to special-case empty trees. + */ + async listByPrefix(prefix: string): Promise { + const startAbs = prefix + ? this.resolveKey(prefix.replace(/\/+$/, '')) + : this.rootResolved; + + const out: string[] = []; + async function walk(dir: string): Promise { + let entries: import('node:fs').Dirent[]; + try { + entries = await fs.readdir(dir, { withFileTypes: true }); + } catch (err) { + if ((err as NodeJS.ErrnoException).code === 'ENOENT') return; + throw err; + } + for (const entry of entries) { + const child = path.join(dir, entry.name); + if (entry.isDirectory()) { + await walk(child); + } else if (entry.isFile()) { + out.push(child); + } + } + } + await walk(startAbs); + + return out + .map((abs) => path.relative(this.rootResolved, abs).split(path.sep).join('/')) + .sort(); + } + /** Used by the proxy route — returns the validated absolute path. */ resolveKeyForProxy(key: string): string { return this.resolveKey(key); diff --git a/src/lib/storage/index.ts b/src/lib/storage/index.ts index eb9e570a..584a65e0 100644 --- a/src/lib/storage/index.ts +++ b/src/lib/storage/index.ts @@ -72,6 +72,15 @@ export interface StorageBackend { /** Generate a short-lived URL the browser can GET from. */ presignDownload(key: string, opts: PresignOpts): Promise<{ url: string; expiresAt: Date }>; + /** + * Recursively list keys under `prefix`. Returns the relative key for each + * object, sorted alphabetically. Empty prefix means "the entire bucket / + * storage root". Used by one-shot importers (e.g. organized-bucket + * document import) that need to walk a flat key namespace; not meant for + * runtime hot paths. + */ + listByPrefix(prefix: string): Promise; + readonly name: StorageBackendName; } diff --git a/src/lib/storage/s3.ts b/src/lib/storage/s3.ts index 9ad1e3c7..bbe2e1bb 100644 --- a/src/lib/storage/s3.ts +++ b/src/lib/storage/s3.ts @@ -211,6 +211,22 @@ export class S3Backend implements StorageBackend { return { url, expiresAt: new Date(Date.now() + expiry * 1000) }; } + /** + * Recursive listObjectsV2 walk under `prefix`. The minio-js stream emits + * one entry per object; we drain it into a flat key array sorted + * alphabetically. Used by one-shot importers; not a hot path. Object + * "directories" (zero-byte placeholders ending in `/`) are filtered out. + */ + async listByPrefix(prefix: string): Promise { + const stream = this.client.listObjectsV2(this.bucket, prefix, true); + const keys: string[] = []; + for await (const obj of stream as AsyncIterable<{ name?: string }>) { + if (obj.name && !obj.name.endsWith('/')) keys.push(obj.name); + } + keys.sort(); + return keys; + } + /** Used by the admin UI's "Test connection" button. */ async healthCheck(): Promise<{ ok: true } | { ok: false; error: string }> { const sentinelKey = `_health/${Date.now()}.txt`; diff --git a/tests/unit/document-import.test.ts b/tests/unit/document-import.test.ts new file mode 100644 index 00000000..bc3bf4e4 --- /dev/null +++ b/tests/unit/document-import.test.ts @@ -0,0 +1,60 @@ +import { describe, expect, it } from 'vitest'; + +import { parseImportPath } from '@/lib/services/document-import'; + +describe('parseImportPath', () => { + it('splits a nested key into folders + filename', () => { + expect(parseImportPath('legacy', 'legacy/Deals 2026/Q1/contract.pdf')).toEqual({ + folderSegments: ['Deals 2026', 'Q1'], + filename: 'contract.pdf', + }); + }); + + it('returns empty folderSegments for a file at the prefix root', () => { + expect(parseImportPath('legacy', 'legacy/index.pdf')).toEqual({ + folderSegments: [], + filename: 'index.pdf', + }); + }); + + it('tolerates trailing slashes on the prefix', () => { + expect(parseImportPath('legacy/', 'legacy/Deals/x.pdf')).toEqual({ + folderSegments: ['Deals'], + filename: 'x.pdf', + }); + expect(parseImportPath('legacy///', 'legacy/Deals/x.pdf')).toEqual({ + folderSegments: ['Deals'], + filename: 'x.pdf', + }); + }); + + it('collapses empty intermediate segments', () => { + expect(parseImportPath('legacy', 'legacy/a//b/c.pdf')).toEqual({ + folderSegments: ['a', 'b'], + filename: 'c.pdf', + }); + }); + + it('handles an empty prefix as "list-the-whole-bucket"', () => { + expect(parseImportPath('', 'Folder/file.pdf')).toEqual({ + folderSegments: ['Folder'], + filename: 'file.pdf', + }); + }); + + it('preserves special characters in folder names', () => { + expect(parseImportPath('', "Q1 — Year's End/contract & rider.pdf")).toEqual({ + folderSegments: ["Q1 — Year's End"], + filename: 'contract & rider.pdf', + }); + }); + + it('throws when the key is not under the prefix', () => { + expect(() => parseImportPath('legacy', 'other/x.pdf')).toThrow(/not under prefix/); + }); + + it('throws when the relative path has no filename', () => { + expect(() => parseImportPath('legacy', 'legacy/')).toThrow(/no filename/); + expect(() => parseImportPath('', '')).toThrow(/no filename/); + }); +});