/**
 * Data Anonymization Service
 *
 * Strips PII (names, emails, etc.) from data before sending to AI services.
 * Returns ID mappings for de-anonymization of results.
 *
 * GDPR Compliance:
 * - All personal identifiers are stripped before AI processing
 * - Project/user IDs are replaced with sequential anonymous IDs
 * - Text content is sanitized to remove emails, phones, URLs, SSN-like IDs
 *   and IPv4 addresses
 * - Validation ensures no PII leakage before each AI call
 */

import type {
  CompetitionCategory,
  OceanIssue,
  FileType,
  SubmissionSource,
} from '@prisma/client'

// ─── Description Limits ──────────────────────────────────────────────────────

/** Maximum description length (in characters) sent to the AI, per use case. */
export const DESCRIPTION_LIMITS = {
  ASSIGNMENT: 300,
  FILTERING: 500,
  ELIGIBILITY: 400,
  MENTOR: 350,
} as const

export type DescriptionContext = keyof typeof DESCRIPTION_LIMITS

// ─── PII Patterns ────────────────────────────────────────────────────────────

/**
 * Patterns treated as PII. All of them carry the global flag, so any code
 * calling `.test()` on them must reset `lastIndex` first.
 */
const PII_PATTERNS = {
  email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
  phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g,
  url: /https?:\/\/[^\s]+/g,
  ssn: /\d{3}-\d{2}-\d{4}/g,
  ipv4: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
} as const

/** Top-level property names that are rejected regardless of their value. */
const SENSITIVE_FIELDS = [
  'email',
  'phone',
  'password',
  'ssn',
  'socialSecurity',
  'creditCard',
  'bankAccount',
  'drivingLicense',
] as const

/** Marker appended to truncated descriptions. */
const ELLIPSIS = '...'

// ─── Basic Anonymization Types (Assignment Service) ──────────────────────────

export interface AnonymizedJuror {
  anonymousId: string
  expertiseTags: string[]
  currentAssignmentCount: number
  maxAssignments: number | null
}

export interface AnonymizedProject {
  anonymousId: string
  title: string
  description: string | null
  tags: string[]
  teamName: string | null
}

export interface JurorMapping {
  anonymousId: string
  realId: string
}

export interface ProjectMapping {
  anonymousId: string
  realId: string
}

export interface AnonymizationResult {
  jurors: AnonymizedJuror[]
  projects: AnonymizedProject[]
  jurorMappings: JurorMapping[]
  projectMappings: ProjectMapping[]
}

// ─── Enhanced Project Types (Filtering/Awards) ───────────────────────────────

/**
 * Comprehensive anonymized project data for AI filtering
 * Includes all fields needed for flexible filtering criteria
 */
export interface AnonymizedProjectForAI {
  project_id: string // P1, P2, etc.
  title: string // Sanitized
  description: string // Truncated + PII stripped
  category: CompetitionCategory | null // STARTUP | BUSINESS_CONCEPT
  ocean_issue: OceanIssue | null // Enum value
  country: string | null
  region: string | null // geographicZone
  institution: string | null
  tags: string[]
  founded_year: number | null // Just the year
  team_size: number
  has_description: boolean
  file_count: number
  file_types: string[] // FileType values
  wants_mentorship: boolean
  submission_source: SubmissionSource
  submitted_date: string | null // YYYY-MM-DD only
}

/**
 * Project input with all relations needed for comprehensive anonymization
 */
export interface ProjectWithRelations {
  id: string
  title: string
  description?: string | null
  teamName?: string | null
  competitionCategory?: CompetitionCategory | null
  oceanIssue?: OceanIssue | null
  country?: string | null
  geographicZone?: string | null
  institution?: string | null
  tags: string[]
  foundedAt?: Date | null
  wantsMentorship?: boolean
  submissionSource: SubmissionSource
  submittedAt?: Date | null
  _count?: {
    teamMembers?: number
    files?: number
  }
  files?: Array<{ fileType: FileType | null }>
}

/**
 * Mapping for de-anonymization
 */
export interface ProjectAIMapping {
  anonymousId: string
  realId: string
}

// ─── Basic Anonymization (Assignment Service) ────────────────────────────────

interface JurorInput {
  id: string
  name?: string | null
  email: string
  expertiseTags: string[]
  maxAssignments?: number | null
  _count?: {
    assignments: number
  }
}

interface ProjectInput {
  id: string
  title: string
  description?: string | null
  tags: string[]
  teamName?: string | null
}

/**
 * Build a zero-padded sequential ID such as `juror_001` or `project_012`.
 */
function sequentialId(prefix: string, index: number): string {
  return `${prefix}_${(index + 1).toString().padStart(3, '0')}`
}

/**
 * Anonymize juror and project data for AI processing (Assignment service)
 *
 * Juror `name` and `email` are never copied to the output. Project titles and
 * descriptions are sanitized, and a real team name is replaced by a positional
 * placeholder (`Team N`).
 *
 * @param jurors - Jurors to anonymize; output order matches input order.
 * @param projects - Projects to anonymize; output order matches input order.
 * @returns Anonymized records plus the mappings needed to restore real IDs.
 */
export function anonymizeForAI(
  jurors: JurorInput[],
  projects: ProjectInput[]
): AnonymizationResult {
  const jurorMappings: JurorMapping[] = []
  const projectMappings: ProjectMapping[] = []

  const anonymizedJurors: AnonymizedJuror[] = jurors.map((juror, index) => {
    const anonymousId = sequentialId('juror', index)
    jurorMappings.push({ anonymousId, realId: juror.id })

    return {
      anonymousId,
      expertiseTags: juror.expertiseTags,
      currentAssignmentCount: juror._count?.assignments ?? 0,
      maxAssignments: juror.maxAssignments ?? null,
    }
  })

  const anonymizedProjects: AnonymizedProject[] = projects.map(
    (project, index) => {
      const anonymousId = sequentialId('project', index)
      projectMappings.push({ anonymousId, realId: project.id })

      return {
        anonymousId,
        title: sanitizeText(project.title),
        description: project.description
          ? truncateAndSanitize(
              project.description,
              DESCRIPTION_LIMITS.ASSIGNMENT
            )
          : null,
        tags: project.tags,
        // Only reveal that a team name exists, never the name itself.
        teamName: project.teamName ? `Team ${index + 1}` : null,
      }
    }
  )

  return {
    jurors: anonymizedJurors,
    projects: anonymizedProjects,
    jurorMappings,
    projectMappings,
  }
}

// ─── Enhanced Anonymization (Filtering/Awards) ───────────────────────────────

/**
 * Anonymize a single project with comprehensive data for AI filtering
 *
 * GDPR Compliance:
 * - Drops the team name; strips emails, phone numbers, URLs from text
 * - Replaces IDs with sequential anonymous IDs
 * - Truncates descriptions to limit data exposure
 * - Keeps only necessary fields for filtering criteria
 *
 * @param project - Project record with its counts and files.
 * @param index - Zero-based position; the anonymous ID is `P${index + 1}`.
 * @param context - Selects the description length limit.
 * @returns The anonymized project record.
 */
export function anonymizeProjectForAI(
  project: ProjectWithRelations,
  index: number,
  context: DescriptionContext = 'FILTERING'
): AnonymizedProjectForAI {
  const descriptionLimit = DESCRIPTION_LIMITS[context]

  const fileTypes =
    project.files
      ?.map((f) => f.fileType)
      .filter((ft): ft is FileType => ft !== null) ?? []

  return {
    project_id: `P${index + 1}`,
    title: sanitizeText(project.title),
    description: truncateAndSanitize(project.description, descriptionLimit),
    category: project.competitionCategory ?? null,
    ocean_issue: project.oceanIssue ?? null,
    country: project.country ?? null,
    region: project.geographicZone ?? null,
    institution: project.institution ?? null,
    tags: project.tags,
    // UTC year, so it agrees with submitted_date (which is derived from
    // toISOString) no matter which timezone the server runs in.
    founded_year: project.foundedAt?.getUTCFullYear() ?? null,
    team_size: project._count?.teamMembers ?? 0,
    has_description: !!project.description?.trim(),
    file_count: project._count?.files ?? 0,
    file_types: fileTypes,
    wants_mentorship: project.wantsMentorship ?? false,
    submission_source: project.submissionSource,
    submitted_date: project.submittedAt?.toISOString().split('T')[0] ?? null,
  }
}

/**
 * Anonymize multiple projects and return mappings
 *
 * @param projects - Projects to anonymize; output order matches input order.
 * @param context - Selects the description length limit.
 * @returns The anonymized projects and one mapping per project.
 */
export function anonymizeProjectsForAI(
  projects: ProjectWithRelations[],
  context: DescriptionContext = 'FILTERING'
): {
  anonymized: AnonymizedProjectForAI[]
  mappings: ProjectAIMapping[]
} {
  const anonymized: AnonymizedProjectForAI[] = []
  const mappings: ProjectAIMapping[] = []

  projects.forEach((project, index) => {
    const entry = anonymizeProjectForAI(project, index, context)
    anonymized.push(entry)
    // Take the ID from the anonymized record so the mapping can never drift
    // from the ID format used by anonymizeProjectForAI.
    mappings.push({ anonymousId: entry.project_id, realId: project.id })
  })

  return { anonymized, mappings }
}

// ─── De-anonymization ────────────────────────────────────────────────────────

/**
 * De-anonymize AI results back to real IDs
 *
 * An ID with no mapping entry is passed through unchanged.
 *
 * @param results - AI results keyed by anonymous juror/project IDs.
 * @param jurorMappings - Juror mappings produced during anonymization.
 * @param projectMappings - Project mappings produced during anonymization.
 * @returns Each result extended with `realJurorId` and `realProjectId`.
 */
export function deanonymizeResults<
  T extends { jurorId: string; projectId: string }
>(
  results: T[],
  jurorMappings: JurorMapping[],
  projectMappings: ProjectMapping[]
): (T & { realJurorId: string; realProjectId: string })[] {
  const jurorMap = new Map<string, string>(
    jurorMappings.map((m): [string, string] => [m.anonymousId, m.realId])
  )
  const projectMap = new Map<string, string>(
    projectMappings.map((m): [string, string] => [m.anonymousId, m.realId])
  )

  return results.map((result) => ({
    ...result,
    realJurorId: jurorMap.get(result.jurorId) || result.jurorId,
    realProjectId: projectMap.get(result.projectId) || result.projectId,
  }))
}

/**
 * De-anonymize project-only results (for filtering/awards)
 *
 * An ID with no mapping entry is passed through unchanged.
 *
 * @param results - AI results keyed by anonymous `project_id`.
 * @param mappings - Mappings produced by anonymizeProjectsForAI.
 * @returns Each result extended with `realProjectId`.
 */
export function deanonymizeProjectResults<T extends { project_id: string }>(
  results: T[],
  mappings: ProjectAIMapping[]
): (T & { realProjectId: string })[] {
  const projectMap = new Map<string, string>(
    mappings.map((m): [string, string] => [m.anonymousId, m.realId])
  )

  return results.map((result) => ({
    ...result,
    realProjectId: projectMap.get(result.project_id) || result.project_id,
  }))
}

// ─── Text Sanitization ───────────────────────────────────────────────────────

/**
 * Sanitize text to remove potential PII patterns
 * Removes URLs, emails, phone numbers, SSN-like IDs and IPv4 addresses
 *
 * @param text - Raw text.
 * @returns The text with every match replaced by a bracketed placeholder.
 */
export function sanitizeText(text: string): string {
  return (
    text
      // URLs go first: they can contain digits or an "@". Replacing those
      // first would cut the URL at the space inside the placeholder and
      // leave half-replaced debris behind.
      .replace(PII_PATTERNS.url, '[url removed]')
      .replace(PII_PATTERNS.email, '[email removed]')
      .replace(PII_PATTERNS.phone, '[phone removed]')
      .replace(PII_PATTERNS.ssn, '[id removed]')
      // The validators reject IPv4 addresses, so sanitization strips them too.
      .replace(PII_PATTERNS.ipv4, '[ip removed]')
  )
}

/**
 * Truncate text to a maximum length and sanitize
 *
 * Sanitization happens before truncation, so a PII match is never cut in half
 * and left unrecognizable in the output.
 *
 * @param text - Raw text; null, undefined and '' all yield ''.
 * @param maxLength - Maximum length of the returned string.
 * @returns Sanitized text, ending in '...' when it had to be shortened and
 *   there is room for the marker.
 */
export function truncateAndSanitize(
  text: string | null | undefined,
  maxLength: number
): string {
  if (!text) return ''

  const sanitized = sanitizeText(text)
  if (sanitized.length <= maxLength) {
    return sanitized
  }

  // No room for the marker: hard-cut instead of slicing with a negative index.
  if (maxLength < ELLIPSIS.length) {
    return sanitized.slice(0, Math.max(0, maxLength))
  }

  return sanitized.slice(0, maxLength - ELLIPSIS.length) + ELLIPSIS
}

// ─── GDPR Compliance Validation ──────────────────────────────────────────────

export interface PIIValidationResult {
  valid: boolean
  violations: string[]
}

/**
 * Return the names of all PII patterns that match the given text.
 */
function detectPIITypes(text: string): string[] {
  const detected: string[] = []
  for (const [type, pattern] of Object.entries(PII_PATTERNS)) {
    // Global regexes remember lastIndex between test() calls; reset it so a
    // previous match cannot make this one start mid-string.
    pattern.lastIndex = 0
    if (pattern.test(text)) {
      detected.push(type)
    }
  }
  return detected
}

/**
 * True when the text is empty/missing or matches no PII pattern.
 */
function isFreeOfPII(text: string | null | undefined): boolean {
  if (!text) return true
  return detectPIITypes(text).length === 0
}

/**
 * Validate that data contains no personal information
 * Used for GDPR compliance before sending data to AI
 *
 * The whole object is serialized and scanned with the PII patterns. Top-level
 * keys are also compared, case-insensitively, against a list of sensitive
 * field names.
 *
 * @param data - Object about to be sent to an AI service.
 * @returns `valid` plus one human-readable message per violation.
 */
export function validateNoPersonalData(
  data: Record<string, unknown>
): PIIValidationResult {
  const textContent = JSON.stringify(data)

  // Check each PII pattern
  const violations = detectPIITypes(textContent).map(
    (type) => `Potential ${type} detected in data`
  )

  // Additional checks for common PII fields. Both sides are lowercased so
  // camelCase entries such as "creditCard" actually match.
  const keys = new Set(Object.keys(data).map((k) => k.toLowerCase()))
  for (const field of SENSITIVE_FIELDS) {
    if (keys.has(field.toLowerCase())) {
      violations.push(`Sensitive field "${field}" present in data`)
    }
  }

  return {
    valid: violations.length === 0,
    violations,
  }
}

/**
 * Enforce GDPR compliance before EVERY AI call
 * Throws an error if PII is detected
 *
 * Only non-null object items are inspected; other items are skipped.
 *
 * @param data - Items about to be sent to an AI service.
 * @throws Error when any inspected item fails validateNoPersonalData.
 */
export function enforceGDPRCompliance(data: unknown[]): void {
  for (let i = 0; i < data.length; i++) {
    const item = data[i]
    if (typeof item !== 'object' || item === null) continue

    const { valid, violations } = validateNoPersonalData(
      item as Record<string, unknown>
    )
    if (!valid) {
      console.error(`[GDPR] PII validation failed for item ${i}:`, violations)
      throw new Error(`GDPR compliance check failed: ${violations.join(', ')}`)
    }
  }
}

/**
 * Validate that data has been properly anonymized
 * Returns true if no PII patterns are detected
 *
 * Checks juror expertise tags and project titles, descriptions and tags.
 */
export function validateAnonymization(data: AnonymizationResult): boolean {
  // Check jurors
  const jurorsClean = data.jurors.every((juror) =>
    juror.expertiseTags.every(isFreeOfPII)
  )
  if (!jurorsClean) return false

  // Check projects
  return data.projects.every(
    (project) =>
      isFreeOfPII(project.title) &&
      isFreeOfPII(project.description) &&
      project.tags.every(isFreeOfPII)
  )
}

/**
 * Validate anonymized projects for AI (enhanced version)
 *
 * Checks title, description, institution and tags of every project.
 */
export function validateAnonymizedProjects(
  projects: AnonymizedProjectForAI[]
): boolean {
  return projects.every(
    (project) =>
      isFreeOfPII(project.title) &&
      isFreeOfPII(project.description) &&
      isFreeOfPII(project.institution) &&
      project.tags.every(isFreeOfPII)
  )
}