/**
 * Importer for an organized S3 / filesystem bucket whose folder structure
 * already represents real organisation. Walks every key under `--bucket-prefix`,
 * builds matching `document_folders` rows mirroring the path, then inserts
 * `documents` + `files` rows pointing at the existing storage keys verbatim
 * — no path rewrite. Use when migrating from a legacy MinIO bucket whose
 * tree is the source of truth.
 *
 * Usage:
 *   pnpm tsx scripts/import-organized-documents.ts --port-slug <slug> \
 *     --bucket-prefix "legacy-imports/" --dry-run
 *   pnpm tsx scripts/import-organized-documents.ts --port-slug <slug> \
 *     --bucket-prefix "legacy-imports/" --apply
 *
 * Idempotency:
 *   - Folders: sibling-name unique index swallows duplicate creates and we
 *     reuse the existing row.
 *   - Documents: skipped when a `files` row with the same
 *     `(portId, storagePath)` already exists — the storage key is the
 *     natural identity for this importer.
 */
import 'dotenv/config';

import path from 'node:path';
import { and, eq, isNull } from 'drizzle-orm';

import { db } from '@/lib/db';
import { ports } from '@/lib/db/schema/ports';
import { documents, documentFolders, files } from '@/lib/db/schema/documents';
import { user } from '@/lib/db/schema/users';
import { createAuditLog } from '@/lib/audit';
import { ConflictError } from '@/lib/errors';
import { createFolder } from '@/lib/services/document-folders.service';
import { parseImportPath } from '@/lib/services/document-import';
import { getStorageBackend } from '@/lib/storage';

/** Parsed command-line options. */
interface CliArgs {
  portSlug: string;
  bucketPrefix: string;
  dryRun: boolean;
  apply: boolean;
  uploadedByUserId: string | null;
}

/**
 * Parses the CLI flags and enforces the mode rules: `--port-slug` is
 * required and exactly one of `--dry-run` / `--apply` must be given.
 * Exits the process (status 1) on invalid input and (status 0) on `--help`.
 *
 * @param argv Arguments after the script name (`process.argv.slice(2)`).
 */
function parseArgs(argv: string[]): CliArgs {
  const args: CliArgs = {
    portSlug: '',
    bucketPrefix: '',
    dryRun: false,
    apply: false,
    uploadedByUserId: null,
  };

  for (let i = 0; i < argv.length; i += 1) {
    const a = argv[i]!;
    if (a === '--port-slug') args.portSlug = argv[++i] ?? '';
    else if (a === '--bucket-prefix') args.bucketPrefix = argv[++i] ?? '';
    else if (a === '--uploaded-by') args.uploadedByUserId = argv[++i] ?? null;
    else if (a === '--dry-run') args.dryRun = true;
    else if (a === '--apply') args.apply = true;
    else if (a === '-h' || a === '--help') {
      printHelp();
      process.exit(0);
    } else {
      console.error(`Unknown argument: ${a}`);
      printHelp();
      process.exit(1);
    }
  }

  if (!args.portSlug) {
    console.error('Missing required --port-slug');
    process.exit(1);
  }
  if (!args.dryRun && !args.apply) {
    console.error('Must specify either --dry-run or --apply.');
    process.exit(1);
  }
  if (args.dryRun && args.apply) {
    console.error('--dry-run and --apply are mutually exclusive.');
    process.exit(1);
  }
  if (!args.bucketPrefix) {
    // Still allowed (behaviour unchanged), but almost certainly worth a
    // second look before running with --apply.
    console.warn(
      'Warning: --bucket-prefix is empty; keys will be listed with an empty prefix (likely the whole bucket).',
    );
  }

  return args;
}

/** Prints the usage summary to stdout. */
function printHelp(): void {
  console.log(`Usage:
  pnpm tsx scripts/import-organized-documents.ts \\
    --port-slug <slug> \\
    --bucket-prefix <prefix> \\
    (--dry-run | --apply) \\
    [--uploaded-by <user-id>]
`);
}

/** One storage key together with everything needed to import it. */
interface PlannedDoc {
  key: string;
  folderSegments: string[];
  filename: string;
  bytes: number | null;
  contentType: string;
  alreadyImported: boolean;
}

/** Fallback MIME types, keyed by lower-cased file extension (with dot). */
const CONTENT_TYPE_BY_EXT: Record<string, string> = {
  '.pdf': 'application/pdf',
  '.png': 'image/png',
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
  '.txt': 'text/plain',
  '.csv': 'text/csv',
  '.doc': 'application/msword',
  '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  '.xls': 'application/vnd.ms-excel',
  '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
};

/**
 * Derives a MIME type from the filename's extension (case-insensitive).
 *
 * @returns The mapped type, or `application/octet-stream` when unknown.
 */
function guessContentType(filename: string): string {
  const ext = path.extname(filename).toLowerCase();
  return CONTENT_TYPE_BY_EXT[ext] ?? 'application/octet-stream';
}

/**
 * Returns the user id to attribute imported rows to: the explicit
 * `--uploaded-by` value when given, otherwise the first row returned from
 * the user table. Exits the process when no user exists at all.
 */
async function resolveUploaderId(explicitId: string | null): Promise<string> {
  if (explicitId) return explicitId;

  // NOTE(review): no ORDER BY, so which user is "first" is up to the database.
  const [fallback] = await db.select({ id: user.id }).from(user).limit(1);
  if (!fallback) {
    console.error(
      'No user rows exist; pass --uploaded-by or seed at least one user before running.',
    );
    process.exit(1);
  }
  console.log(`No --uploaded-by provided; falling back to first user: ${fallback.id}`);
  return fallback.id;
}

/**
 * Lists every key under `bucketPrefix` and turns each one that parses to a
 * filename into a {@link PlannedDoc}, flagging keys that already have a
 * `files` row for this port.
 */
async function buildPlan(
  backend: Awaited<ReturnType<typeof getStorageBackend>>,
  portId: string,
  bucketPrefix: string,
): Promise<PlannedDoc[]> {
  console.log(`Listing keys under prefix "${bucketPrefix}" via ${backend.name} backend …`);
  const keys = await backend.listByPrefix(bucketPrefix);
  console.log(`Found ${keys.length} candidate keys.`);

  const plan: PlannedDoc[] = [];
  for (const key of keys) {
    const parsed = parseImportPath(bucketPrefix, key);
    // Keys without a filename component are skipped.
    if (!parsed.filename) continue;

    const head = await backend.head(key);
    const existing = await db.query.files.findFirst({
      where: and(eq(files.portId, portId), eq(files.storagePath, key)),
      columns: { id: true },
    });

    plan.push({
      key,
      folderSegments: parsed.folderSegments,
      filename: parsed.filename,
      bytes: head?.sizeBytes ?? null,
      contentType: head?.contentType ?? guessContentType(parsed.filename),
      alreadyImported: !!existing,
    });
  }
  return plan;
}

/**
 * Inserts the `files` + `documents` rows for one planned entry in a single
 * transaction, then records an audit log entry for the new document.
 */
async function importEntry(
  portId: string,
  uploadedById: string,
  entry: PlannedDoc,
  folderId: string | null,
): Promise<void> {
  const documentId = await db.transaction(async (tx) => {
    const [fileRow] = await tx
      .insert(files)
      .values({
        portId,
        filename: entry.filename,
        originalName: entry.filename,
        mimeType: entry.contentType,
        sizeBytes: entry.bytes !== null ? String(entry.bytes) : null,
        storagePath: entry.key,
        uploadedBy: uploadedById,
        category: 'misc',
        folderId,
      })
      .returning();
    if (!fileRow) throw new Error(`Insert into files returned no row for ${entry.key}`);

    const [docRow] = await tx
      .insert(documents)
      .values({
        portId,
        documentType: 'other',
        title: entry.filename,
        createdBy: uploadedById,
        folderId,
        fileId: fileRow.id,
        status: 'completed',
        isManualUpload: true,
      })
      .returning();
    if (!docRow) throw new Error(`Insert into documents returned no row for ${entry.key}`);

    return docRow.id;
  });

  // Written only after the transaction has committed, so a rolled-back insert
  // never leaves an audit entry behind. Awaited so the process.exit() at the
  // end of the script cannot cut off a pending write (assumes createAuditLog
  // returns its promise — TODO confirm). Still best-effort: a failure is
  // reported but does not abort the import.
  try {
    await createAuditLog({
      userId: uploadedById,
      portId,
      action: 'create',
      entityType: 'document',
      entityId: documentId,
      metadata: {
        source: 'organized-bucket-importer',
        storageKey: entry.key,
        folderSegments: entry.folderSegments,
      },
    });
  } catch (err: unknown) {
    console.warn(`Audit log write failed for ${entry.key}:`, err);
  }
}

/**
 * Script entry point: resolves the port and uploader, builds and prints the
 * plan, and — unless `--dry-run` — imports every entry not yet imported.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));

  const port = await db.query.ports.findFirst({ where: eq(ports.slug, args.portSlug) });
  if (!port) {
    console.error(`Port not found: ${args.portSlug}`);
    process.exit(1);
  }

  const uploadedById = await resolveUploaderId(args.uploadedByUserId);

  const backend = await getStorageBackend();
  const plan = await buildPlan(backend, port.id, args.bucketPrefix);

  printPlan(plan);

  if (args.dryRun) {
    console.log('\nDry-run complete. No changes written.');
    return;
  }

  // Folder path ("a/b/c") → folder id; the empty path is the root (no folder).
  const folderIdByPath = new Map<string, string | null>();
  folderIdByPath.set('', null);

  let createdCount = 0;
  let skippedCount = 0;

  for (const entry of plan) {
    if (entry.alreadyImported) {
      skippedCount += 1;
      continue;
    }

    const folderId = await ensureFolderChain(
      port.id,
      uploadedById,
      entry.folderSegments,
      folderIdByPath,
    );

    await importEntry(port.id, uploadedById, entry, folderId);

    createdCount += 1;
    console.log(`✓ Imported ${entry.key}`);
  }

  console.log(
    `\nDone. Created ${createdCount} documents, skipped ${skippedCount} (already imported).`,
  );
}

/**
 * Makes sure a folder exists for every prefix of `segments`, creating or
 * reusing rows top-down, and returns the id of the deepest folder.
 *
 * @param cache Path → folder id memo shared across calls; updated in place.
 * @returns The deepest folder's id, or `null` when `segments` is empty.
 */
async function ensureFolderChain(
  portId: string,
  userId: string,
  segments: string[],
  cache: Map<string, string | null>,
): Promise<string | null> {
  if (segments.length === 0) return null;

  let parentId: string | null = null;
  for (let i = 0; i < segments.length; i += 1) {
    const pathKey = segments.slice(0, i + 1).join('/');
    const cached = cache.get(pathKey);
    if (cached !== undefined) {
      parentId = cached;
      continue;
    }
    const name = segments[i]!;
    parentId = await createOrFindFolder(portId, userId, name, parentId);
    cache.set(pathKey, parentId);
  }
  return parentId;
}

/**
 * Creates a folder under `parentId`, or — when `createFolder` reports a
 * `ConflictError` — returns the id of the existing sibling whose name
 * matches case-insensitively after trimming.
 *
 * @throws Any non-conflict error from `createFolder`, or the original
 *   `ConflictError` when no matching sibling can be found.
 */
async function createOrFindFolder(
  portId: string,
  userId: string,
  name: string,
  parentId: string | null,
): Promise<string> {
  try {
    const created = await createFolder(portId, userId, { name, parentId });
    return created.id;
  } catch (err: unknown) {
    if (!(err instanceof ConflictError)) throw err;

    // Sibling-name unique index hit — fetch the existing row so the import
    // remains idempotent across re-runs. The query is scoped to the exact
    // parent (IS NULL for root-level folders) so it only loads siblings.
    const wanted = name.trim().toLowerCase();
    const siblings = await db.query.documentFolders.findMany({
      where: and(
        eq(documentFolders.portId, portId),
        parentId ? eq(documentFolders.parentId, parentId) : isNull(documentFolders.parentId),
      ),
    });
    const existing = siblings.find((row) => row.name.toLowerCase() === wanted);
    if (!existing) throw err;
    return existing.id;
  }
}

/**
 * Prints the plan grouped by folder path (sorted), marking new entries
 * with `+` and already-imported ones with `·`, followed by totals.
 */
function printPlan(plan: PlannedDoc[]): void {
  const grouped = new Map<string, PlannedDoc[]>();
  for (const entry of plan) {
    const folder = entry.folderSegments.join('/') || '(root)';
    if (!grouped.has(folder)) grouped.set(folder, []);
    grouped.get(folder)!.push(entry);
  }

  const folderNames = Array.from(grouped.keys()).sort();
  console.log('\nPlan:');
  for (const folder of folderNames) {
    console.log(`  ${folder}/`);
    for (const entry of grouped.get(folder)!) {
      const flag = entry.alreadyImported ? '·' : '+';
      const size = entry.bytes !== null ? ` (${entry.bytes}B)` : '';
      console.log(`    ${flag} ${entry.filename}${size}`);
    }
  }

  const newCount = plan.filter((p) => !p.alreadyImported).length;
  const dupCount = plan.length - newCount;
  console.log(`\nTotal: ${plan.length} keys → ${newCount} new, ${dupCount} already imported.`);
}

main()
  .then(() => process.exit(0))
  .catch((err: unknown) => {
    console.error(err);
    process.exit(1);
  });