feat(documents): importer for organized S3/filesystem buckets

One-shot script that walks an existing organized bucket tree, builds
matching document_folders rows mirroring the path, then inserts
documents + files rows pointing at the existing storage keys verbatim
— no path rewrite. For migrating from a legacy MinIO bucket whose
folder structure is already the source of truth.

Idempotency:
  • Folders: sibling-name unique index swallows duplicate creates;
    we reuse the row on ConflictError.
  • Documents: skipped when (port_id, fileStoragePath) already exists.

Adds StorageBackend.listByPrefix (recursive readdir on filesystem;
listObjectsV2 stream-drain on s3). This importer is its first caller;
it is a one-shot path, not a hot path. The pure parseImportPath helper
is extracted to its own module and unit-tested for trailing slashes,
empty intermediate segments, prefix mismatch, and special-character
folder names (8 tests).

Audit log per imported doc carries source='organized-bucket-importer',
storageKey, and folderSegments so the documents inspector can filter
on imports later.

CLI:
  pnpm tsx scripts/import-organized-documents.ts \
      --port-slug <slug> \
      --bucket-prefix "legacy-imports/" \
      (--dry-run | --apply) [--uploaded-by <userId>]

Folds in Prettier post-hook drift on documents.service.ts +
download handler — same lint-staged formatting the earlier commits
already absorbed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-10 16:53:51 +02:00
parent e790ff708b
commit ef63e86fde
8 changed files with 495 additions and 9 deletions

View File

@@ -0,0 +1,48 @@
/**
* Pure helpers for the organized-bucket document importer
* (`scripts/import-organized-documents.ts`).
*
* The script walks an existing storage prefix that already represents real
* organisation (e.g. `legacy-imports/Deals 2026/Q1/contract.pdf`) and
* materialises matching `document_folders` + `documents` rows in the CRM
* without rewriting the storage keys. Splitting these helpers out of the
* script body makes the path-parser unit-testable in isolation.
*/
export interface ParsedImportPath {
  /** Folder names from outermost to innermost; empty when the file is at the prefix root. */
  folderSegments: string[];
  /** Filename only, never empty. */
  filename: string;
}

/**
 * Decompose a storage key into folder segments + filename relative to the
 * importer prefix. Both `prefix` and `key` are split on `/` only, so they
 * must already use POSIX separators.
 *
 * Edge cases:
 *  - Trailing slashes on prefix are tolerated (`legacy/` ≡ `legacy`).
 *  - An empty prefix (or one made only of slashes) matches every key.
 *  - Empty intermediate segments (`a//b`) collapse to `[a, b]`.
 *  - Leading-prefix mismatch throws — the caller should never feed in keys
 *    outside the prefix it asked the backend to list. The match is on
 *    `${prefix}/`, so `legacy-other/x` is NOT under prefix `legacy`.
 *  - A key whose relative part ends in `/` (directory placeholder) or is
 *    empty has no filename and throws, rather than silently promoting the
 *    last folder name to a filename.
 *
 * @param prefix Storage prefix the keys were listed under.
 * @param key Full storage key, including the prefix.
 * @returns The folder chain below the prefix and the non-empty filename.
 * @throws Error when `key` is not under `prefix`, or when it carries no filename.
 */
export function parseImportPath(prefix: string, key: string): ParsedImportPath {
  const normalizedPrefix = prefix.replace(/\/+$/, '');
  let relative = key;
  if (normalizedPrefix.length > 0) {
    if (!key.startsWith(`${normalizedPrefix}/`)) {
      throw new Error(`Key "${key}" is not under prefix "${prefix}"`);
    }
    relative = key.slice(normalizedPrefix.length + 1);
  }

  const segments = relative.split('/').filter((segment) => segment.length > 0);

  // A trailing slash marks a directory placeholder: every remaining segment
  // is a folder, so there is no filename to pop.
  const filename = relative.endsWith('/') ? undefined : segments.pop();
  if (filename === undefined) {
    throw new Error(`Key "${key}" has no filename after stripping prefix`);
  }

  return { folderSegments: segments, filename };
}

View File

@@ -307,10 +307,7 @@ async function hydrateDocumentsWithDownloadUrl(
const filename = row.fileId ? (filenameById.get(row.fileId) ?? null) : null;
return {
...row,
downloadUrl: buildDocumentDownloadUrl(
{ id: row.id, folderId: row.folderId, filename },
tree,
),
downloadUrl: buildDocumentDownloadUrl({ id: row.id, folderId: row.folderId, filename }, tree),
};
});
}

View File

@@ -361,6 +361,43 @@ export class FilesystemBackend implements StorageBackend {
};
}
/**
 * Lists every regular file below `${root}/${prefix}` by walking the
 * directory tree depth-first. Keys are returned relative to the storage
 * root, joined with `/`, in default string sort order. A falsy prefix
 * walks the whole storage root. Intended for one-shot importers rather
 * than request-time code paths.
 *
 * A directory that does not exist (ENOENT) contributes no entries instead
 * of failing, so a missing prefix simply yields `[]`. Entries that are
 * neither directories nor regular files are ignored.
 */
async listByPrefix(prefix: string): Promise<string[]> {
  let startDir: string;
  if (prefix) {
    // Trailing slashes are dropped before the key is resolved.
    startDir = this.resolveKey(prefix.replace(/\/+$/, ''));
  } else {
    startDir = this.rootResolved;
  }

  const visit = async (dir: string, sink: string[]): Promise<void> => {
    let dirents: import('node:fs').Dirent[];
    try {
      dirents = await fs.readdir(dir, { withFileTypes: true });
    } catch (err) {
      // Only a missing directory is tolerated; anything else propagates.
      if ((err as NodeJS.ErrnoException).code !== 'ENOENT') throw err;
      return;
    }
    for (const dirent of dirents) {
      const fullPath = path.join(dir, dirent.name);
      if (dirent.isDirectory()) {
        await visit(fullPath, sink);
        continue;
      }
      if (dirent.isFile()) sink.push(fullPath);
    }
  };

  const absolutePaths: string[] = [];
  await visit(startDir, absolutePaths);

  // Convert absolute paths to root-relative keys with POSIX separators.
  const keys = absolutePaths.map((abs) =>
    path.relative(this.rootResolved, abs).split(path.sep).join('/'),
  );
  keys.sort();
  return keys;
}
/** Used by the proxy route — returns the validated absolute path. */
resolveKeyForProxy(key: string): string {
return this.resolveKey(key);

View File

@@ -72,6 +72,15 @@ export interface StorageBackend {
/** Generate a short-lived URL the browser can GET from. */
presignDownload(key: string, opts: PresignOpts): Promise<{ url: string; expiresAt: Date }>;
/**
* Recursively list keys under `prefix`. Returns the full key of each
* object — relative to the bucket / storage root, so it still includes
* `prefix` — sorted alphabetically. Empty prefix means "the entire
* bucket / storage root". Used by one-shot importers (e.g.
* organized-bucket document import) that need to walk a flat key
* namespace; not meant for runtime hot paths.
*/
listByPrefix(prefix: string): Promise<string[]>;
readonly name: StorageBackendName;
}

View File

@@ -211,6 +211,22 @@ export class S3Backend implements StorageBackend {
return { url, expiresAt: new Date(Date.now() + expiry * 1000) };
}
/**
 * Recursive listObjectsV2 walk under `prefix`. The stream is drained into
 * a flat array of object keys, sorted alphabetically. Used by one-shot
 * importers; not a hot path. Object "directories" (placeholder keys
 * ending in `/`) are filtered out.
 *
 * The prefix is treated as a folder: trailing slashes are collapsed and a
 * single `/` is appended before listing. Without this, a plain string
 * prefix such as `legacy` would also match sibling keys like
 * `legacy-other/file.pdf`, which are not under the requested folder.
 * An empty prefix (or one made only of slashes) is passed through as-is.
 *
 * @param prefix Folder-style key prefix; empty lists the whole bucket.
 * @returns Sorted object keys (including the prefix), without placeholders.
 */
async listByPrefix(prefix: string): Promise<string[]> {
  const trimmed = prefix.replace(/\/+$/, '');
  // Only rewrite when a real folder name remains after trimming.
  const listPrefix = trimmed.length > 0 ? `${trimmed}/` : prefix;

  const stream = this.client.listObjectsV2(this.bucket, listPrefix, true);
  const keys: string[] = [];
  for await (const obj of stream as AsyncIterable<{ name?: string }>) {
    if (obj.name && !obj.name.endsWith('/')) keys.push(obj.name);
  }
  keys.sort();
  return keys;
}
/** Used by the admin UI's "Test connection" button. */
async healthCheck(): Promise<{ ok: true } | { ok: false; error: string }> {
const sentinelKey = `_health/${Date.now()}.txt`;