MOPC-App/src/server/services/anonymization.ts

538 lines
15 KiB
TypeScript

/**
* Data Anonymization Service
*
* Strips PII (names, emails, etc.) from data before sending to AI services.
* Returns ID mappings for de-anonymization of results.
*
* GDPR Compliance:
* - All personal identifiers are stripped before AI processing
* - Project/user IDs are replaced with sequential anonymous IDs
* - Text content is sanitized to remove emails, phones, URLs
* - Validation ensures no PII leakage before each AI call
*/
import type {
CompetitionCategory,
OceanIssue,
FileType,
SubmissionSource,
} from '@prisma/client'
// ─── Description Limits ──────────────────────────────────────────────────────
/**
* Upper bounds on description length, keyed by AI use case.
*
* A value from this table is passed as `maxLength` to truncateAndSanitize,
* which compares it against the `.length` of the sanitized string.
*/
export const DESCRIPTION_LIMITS = {
ASSIGNMENT: 300, // used by anonymizeForAI
FILTERING: 500, // default context of anonymizeProjectForAI / anonymizeProjectsForAI
ELIGIBILITY: 400,
MENTOR: 350,
} as const
/** Name of a use case that has an entry in DESCRIPTION_LIMITS. */
export type DescriptionContext = keyof typeof DESCRIPTION_LIMITS
// ─── PII Patterns ────────────────────────────────────────────────────────────
/**
* Regular expressions used to detect and strip PII-like substrings.
*
* Every pattern carries the global (`g`) flag, so `lastIndex` persists on the
* shared RegExp object between `.test()` calls. The validators in this file
* reset `lastIndex` to 0 before each test for that reason.
*/
const PII_PATTERNS = {
// local-part@domain.tld, where the TLD is at least two letters
email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
// optional 1-3 digit prefix (optionally led by "+"), then 3-3-4 digit groups;
// separators may be "-", ".", whitespace, or absent
phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g,
// "http://" or "https://" followed by everything up to the next whitespace
url: /https?:\/\/[^\s]+/g,
// 3-2-4 digit groups joined by hyphens
ssn: /\d{3}-\d{2}-\d{4}/g,
// four dot-separated groups of 1-3 digits; group values are not range-checked
ipv4: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
} as const
// ─── Basic Anonymization Types (Assignment Service) ──────────────────────────
/**
* Juror record as produced by anonymizeForAI.
* Carries no name or email; the juror is identified only by `anonymousId`.
*/
export interface AnonymizedJuror {
// Sequential ID of the form "juror_001", "juror_002", ...
anonymousId: string
// Copied unchanged from the input juror
expertiseTags: string[]
// Input `_count.assignments`, or 0 when `_count` is absent
currentAssignmentCount: number
// null when the input has no `maxAssignments` value
maxAssignments: number | null
}
/**
* Project record as produced by anonymizeForAI.
*/
export interface AnonymizedProject {
// Sequential ID of the form "project_001", "project_002", ...
anonymousId: string
// Input title after sanitizeText
title: string
// Sanitized and truncated to DESCRIPTION_LIMITS.ASSIGNMENT; null when the
// input description is missing or empty
description: string | null
// Copied unchanged from the input project
tags: string[]
// Placeholder "Team <n>" when the input has a team name, otherwise null;
// the real team name is never copied
teamName: string | null
}
/**
* Links an anonymous juror ID ("juror_NNN") back to the real juror ID.
* Consumed by deanonymizeResults.
*/
export interface JurorMapping {
anonymousId: string
realId: string
}
/**
* Links an anonymous project ID ("project_NNN") back to the real project ID.
* Consumed by deanonymizeResults.
*/
export interface ProjectMapping {
anonymousId: string
realId: string
}
/**
* Return value of anonymizeForAI: the anonymized records plus the mappings
* needed to translate anonymous IDs back to real ones.
*/
export interface AnonymizationResult {
jurors: AnonymizedJuror[]
projects: AnonymizedProject[]
// One entry per input juror, in input order
jurorMappings: JurorMapping[]
// One entry per input project, in input order
projectMappings: ProjectMapping[]
}
// ─── Enhanced Project Types (Filtering/Awards) ───────────────────────────────
/**
* Anonymized project payload produced by anonymizeProjectForAI.
* Field names are snake_case; the comments describe how each value is derived
* from the source ProjectWithRelations.
*/
export interface AnonymizedProjectForAI {
project_id: string // "P<n>", where n is the 1-based position (P1, P2, ...)
title: string // Source title after sanitizeText
description: string // Sanitized + truncated; empty string when the source has none
category: CompetitionCategory | null // competitionCategory, or null when unset
ocean_issue: OceanIssue | null // oceanIssue, or null when unset
country: string | null // Copied unchanged
region: string | null // Source geographicZone, copied unchanged
institution: string | null // Copied unchanged
tags: string[] // Copied unchanged
founded_year: number | null // foundedAt.getFullYear(); null when foundedAt is unset
team_size: number // _count.teamMembers, or 0 when absent
has_description: boolean // true when the source description has non-whitespace content
file_count: number // _count.files, or 0 when absent
file_types: string[] // fileType of each attached file, null entries dropped
wants_mentorship: boolean // false when the source flag is unset
submission_source: SubmissionSource
submitted_date: string | null // Date part of submittedAt.toISOString() (YYYY-MM-DD, UTC)
}
/**
* Input shape accepted by anonymizeProjectForAI / anonymizeProjectsForAI:
* a project plus the optional `_count` and `files` relations.
*
* Only `id`, `title`, `tags` and `submissionSource` are required; every other
* field may be omitted or null and is replaced by a default during
* anonymization.
*/
export interface ProjectWithRelations {
// Real project ID; kept out of the AI payload and stored only in the mapping
id: string
title: string
description?: string | null
// Declared here but not read by anonymizeProjectForAI
teamName?: string | null
competitionCategory?: CompetitionCategory | null
oceanIssue?: OceanIssue | null
country?: string | null
geographicZone?: string | null
institution?: string | null
tags: string[]
foundedAt?: Date | null
wantsMentorship?: boolean
submissionSource: SubmissionSource
submittedAt?: Date | null
// Relation counts; each one falls back to 0 when absent
_count?: {
teamMembers?: number
files?: number
}
// Only `fileType` of each file is read
files?: Array<{ fileType: FileType | null }>
}
/**
* Links an anonymous project ID ("P<n>", as produced by anonymizeProjectsForAI)
* back to the real project ID. Consumed by deanonymizeProjectResults.
*/
export interface ProjectAIMapping {
anonymousId: string
realId: string
}
// ─── Project Conversion Helper ──────────────────────────────────────────────
/**
 * Adapts a project record whose enum-like columns are typed as plain strings
 * into the stricter ProjectWithRelations shape.
 *
 * Defaults applied while copying:
 * - `wantsMentorship` becomes `false` when null or undefined
 * - `submissionSource` becomes `'MANUAL'` when null or undefined
 * - `_count.teamMembers` becomes `0` when absent
 * - `_count.files` falls back to `files.length`, then to `0`
 * - `files` becomes `[]` when absent; each entry keeps only `fileType`
 *   (`null` when missing)
 *
 * The string-to-enum conversions are compile-time casts only; the values are
 * not validated at runtime.
 *
 * @param project - Project record, optionally carrying `_count` and `files`.
 * @returns A new object in the ProjectWithRelations shape.
 */
export function toProjectWithRelations(project: {
  id: string
  title: string
  description?: string | null
  competitionCategory?: string | null
  oceanIssue?: string | null
  country?: string | null
  geographicZone?: string | null
  institution?: string | null
  tags: string[]
  foundedAt?: Date | null
  wantsMentorship?: boolean | null
  submissionSource?: string
  submittedAt?: Date | null
  _count?: { teamMembers?: number; files?: number }
  files?: Array<{ fileType?: string | null; [key: string]: unknown }>
}): ProjectWithRelations {
  const sourceFiles = project.files
  const counts = project._count

  // Reduce every file entry to its type; extra keys on the entry are dropped.
  const files = sourceFiles
    ? sourceFiles.map((file) => ({
        fileType: file.fileType == null ? null : (file.fileType as FileType),
      }))
    : []

  // An explicit count wins; otherwise count the loaded files, if any.
  const teamMembers = counts?.teamMembers ?? 0
  const fileCount = counts?.files ?? sourceFiles?.length ?? 0

  return {
    id: project.id,
    title: project.title,
    description: project.description,
    competitionCategory:
      project.competitionCategory as ProjectWithRelations['competitionCategory'],
    oceanIssue: project.oceanIssue as ProjectWithRelations['oceanIssue'],
    country: project.country,
    geographicZone: project.geographicZone,
    institution: project.institution,
    tags: project.tags,
    foundedAt: project.foundedAt,
    wantsMentorship: project.wantsMentorship ?? false,
    submissionSource:
      (project.submissionSource as ProjectWithRelations['submissionSource']) ?? 'MANUAL',
    submittedAt: project.submittedAt,
    _count: { teamMembers, files: fileCount },
    files,
  }
}
// ─── Basic Anonymization (Assignment Service) ────────────────────────────────
/**
* Juror record accepted by anonymizeForAI.
* `name` and `email` are part of the accepted shape, but anonymizeForAI never
* copies them into its output.
*/
interface JurorInput {
// Real juror ID; stored only in the juror mapping
id: string
name?: string | null
email: string
expertiseTags: string[]
maxAssignments?: number | null
// Relation count; `assignments` is treated as 0 when `_count` is absent
_count?: {
assignments: number
}
}
/**
* Project record accepted by anonymizeForAI.
*/
interface ProjectInput {
// Real project ID; stored only in the project mapping
id: string
title: string
description?: string | null
tags: string[]
// Only its presence is used: a truthy value yields a "Team <n>" placeholder
teamName?: string | null
}
/**
 * Builds the anonymized juror/project payload for the assignment service.
 *
 * Jurors and projects receive 1-based, zero-padded sequential IDs
 * ("juror_001", "project_001", ...) in input order. Juror name and email are
 * never copied. Project titles are sanitized, descriptions are sanitized and
 * truncated to DESCRIPTION_LIMITS.ASSIGNMENT, and a real team name is replaced
 * by a "Team <n>" placeholder. Tags are passed through unchanged.
 *
 * @param jurors - Jurors to anonymize.
 * @param projects - Projects to anonymize.
 * @returns Anonymized records plus the ID mappings needed by deanonymizeResults.
 */
export function anonymizeForAI(
  jurors: JurorInput[],
  projects: ProjectInput[]
): AnonymizationResult {
  // Zero-padded, 1-based sequence number shared by both ID schemes.
  const sequence = (position: number): string =>
    String(position + 1).padStart(3, '0')

  const jurorMappings: JurorMapping[] = []
  const anonymizedJurors: AnonymizedJuror[] = []
  jurors.forEach((juror, position) => {
    const anonymousId = `juror_${sequence(position)}`
    jurorMappings.push({ anonymousId, realId: juror.id })
    anonymizedJurors.push({
      anonymousId,
      expertiseTags: juror.expertiseTags,
      currentAssignmentCount: juror._count?.assignments ?? 0,
      maxAssignments: juror.maxAssignments ?? null,
    })
  })

  const projectMappings: ProjectMapping[] = []
  const anonymizedProjects: AnonymizedProject[] = []
  projects.forEach((project, position) => {
    const anonymousId = `project_${sequence(position)}`
    projectMappings.push({ anonymousId, realId: project.id })

    let description: string | null = null
    if (project.description) {
      description = truncateAndSanitize(
        project.description,
        DESCRIPTION_LIMITS.ASSIGNMENT
      )
    }

    anonymizedProjects.push({
      anonymousId,
      title: sanitizeText(project.title),
      description,
      tags: project.tags,
      // Only signal that a team name exists; never forward the name itself.
      teamName: project.teamName ? `Team ${position + 1}` : null,
    })
  })

  return {
    jurors: anonymizedJurors,
    projects: anonymizedProjects,
    jurorMappings,
    projectMappings,
  }
}
// ─── Enhanced Anonymization (Filtering/Awards) ───────────────────────────────
/**
 * Converts one project into the anonymized payload used for AI filtering.
 *
 * - The real ID is replaced by "P<index + 1>".
 * - The title is sanitized; the description is sanitized and truncated to the
 *   limit registered for `context` (empty string when there is none).
 * - Dates are reduced to a year (`founded_year`) or a YYYY-MM-DD string
 *   (`submitted_date`).
 * - Team name is not included; missing optional values become `null`, `0`,
 *   `false` or `[]`.
 *
 * @param project - Source project with optional relations.
 * @param index - Zero-based position, used to derive the anonymous ID.
 * @param context - Selects the description limit from DESCRIPTION_LIMITS.
 * @returns The anonymized project.
 */
export function anonymizeProjectForAI(
  project: ProjectWithRelations,
  index: number,
  context: DescriptionContext = 'FILTERING'
): AnonymizedProjectForAI {
  const maxDescriptionLength = DESCRIPTION_LIMITS[context]
  const counts = project._count

  // Keep every file type except explicit nulls.
  const fileTypes: FileType[] = []
  for (const file of project.files ?? []) {
    if (file.fileType !== null) {
      fileTypes.push(file.fileType)
    }
  }

  const foundedYear = project.foundedAt ? project.foundedAt.getFullYear() : null

  // toISOString() is "YYYY-MM-DDTHH:mm:ss.sssZ"; keep what precedes the "T".
  let submittedDate: string | null = null
  if (project.submittedAt) {
    const [datePart] = project.submittedAt.toISOString().split('T')
    submittedDate = datePart ?? null
  }

  return {
    project_id: `P${index + 1}`,
    title: sanitizeText(project.title),
    description: truncateAndSanitize(project.description, maxDescriptionLength),
    category: project.competitionCategory ?? null,
    ocean_issue: project.oceanIssue ?? null,
    country: project.country ?? null,
    region: project.geographicZone ?? null,
    institution: project.institution ?? null,
    tags: project.tags,
    founded_year: foundedYear,
    team_size: counts?.teamMembers ?? 0,
    has_description: Boolean(project.description?.trim()),
    file_count: counts?.files ?? 0,
    file_types: fileTypes,
    wants_mentorship: project.wantsMentorship ?? false,
    submission_source: project.submissionSource,
    submitted_date: submittedDate,
  }
}
/**
 * Anonymizes a list of projects and records how to map the results back.
 *
 * Project at position `i` receives the anonymous ID "P<i + 1>"; the same ID
 * appears in both the anonymized record and its mapping entry.
 *
 * @param projects - Projects to anonymize, in the order they will be numbered.
 * @param context - Selects the description limit from DESCRIPTION_LIMITS.
 * @returns `anonymized` records and `mappings`, both in input order.
 */
export function anonymizeProjectsForAI(
  projects: ProjectWithRelations[],
  context: DescriptionContext = 'FILTERING'
): {
  anonymized: AnonymizedProjectForAI[]
  mappings: ProjectAIMapping[]
} {
  const anonymized: AnonymizedProjectForAI[] = []
  const mappings: ProjectAIMapping[] = []

  projects.forEach((project, position) => {
    mappings.push({ anonymousId: `P${position + 1}`, realId: project.id })
    anonymized.push(anonymizeProjectForAI(project, position, context))
  })

  return { anonymized, mappings }
}
// ─── De-anonymization ────────────────────────────────────────────────────────
/**
 * Attaches real juror and project IDs to AI results that reference anonymous
 * IDs.
 *
 * Each result is shallow-copied and extended with `realJurorId` and
 * `realProjectId`. When an anonymous ID has no mapping (or maps to an empty
 * string), the ID from the result is used as-is.
 *
 * @param results - AI output rows carrying anonymous `jurorId` / `projectId`.
 * @param jurorMappings - Anonymous-to-real juror ID pairs.
 * @param projectMappings - Anonymous-to-real project ID pairs.
 * @returns New rows, in input order, with the two real-ID fields added.
 */
export function deanonymizeResults<
  T extends { jurorId: string; projectId: string }
>(
  results: T[],
  jurorMappings: JurorMapping[],
  projectMappings: ProjectMapping[]
): (T & { realJurorId: string; realProjectId: string })[] {
  const realJurorIds = new Map<string, string>()
  for (const { anonymousId, realId } of jurorMappings) {
    realJurorIds.set(anonymousId, realId)
  }

  const realProjectIds = new Map<string, string>()
  for (const { anonymousId, realId } of projectMappings) {
    realProjectIds.set(anonymousId, realId)
  }

  return results.map((row) => {
    const realJurorId = realJurorIds.get(row.jurorId) || row.jurorId
    const realProjectId = realProjectIds.get(row.projectId) || row.projectId
    return { ...row, realJurorId, realProjectId }
  })
}
/**
 * Attaches the real project ID to AI results that reference "P<n>" IDs
 * (filtering / awards flows).
 *
 * Each result is shallow-copied and extended with `realProjectId`. When the
 * anonymous ID has no mapping (or maps to an empty string), the `project_id`
 * from the result is used as-is.
 *
 * @param results - AI output rows carrying an anonymous `project_id`.
 * @param mappings - Anonymous-to-real project ID pairs.
 * @returns New rows, in input order, with `realProjectId` added.
 */
export function deanonymizeProjectResults<T extends { project_id: string }>(
  results: T[],
  mappings: ProjectAIMapping[]
): (T & { realProjectId: string })[] {
  const realIds = new Map<string, string>()
  for (const entry of mappings) {
    realIds.set(entry.anonymousId, entry.realId)
  }

  return results.map((row) => {
    const realProjectId = realIds.get(row.project_id) || row.project_id
    return { ...row, realProjectId }
  })
}
// ─── Text Sanitization ───────────────────────────────────────────────────────
/**
 * Replaces PII-like substrings with fixed placeholders.
 *
 * Every pattern in PII_PATTERNS is applied, so text returned from here does
 * not trip the validators in this file on a pattern this function failed to
 * strip:
 *
 * | pattern | placeholder       |
 * | ------- | ----------------- |
 * | url     | `[url removed]`   |
 * | email   | `[email removed]` |
 * | phone   | `[phone removed]` |
 * | ssn     | `[id removed]`    |
 * | ipv4    | `[ip removed]`    |
 *
 * @param text - Free text that may contain personal identifiers.
 * @returns The text with every match replaced by its placeholder.
 */
export function sanitizeText(text: string): string {
  return (
    text
      // URLs go first. The URL pattern stops at whitespace, and the other
      // placeholders contain a space: replacing an email or phone number
      // embedded in a URL beforehand would cut the URL in two and leave its
      // tail (e.g. "/profile") in the output.
      .replace(PII_PATTERNS.url, '[url removed]')
      .replace(PII_PATTERNS.email, '[email removed]')
      .replace(PII_PATTERNS.phone, '[phone removed]')
      .replace(PII_PATTERNS.ssn, '[id removed]')
      // The validators test for IPv4-looking strings, so they must be stripped
      // here too; otherwise sanitized text still fails validation.
      .replace(PII_PATTERNS.ipv4, '[ip removed]')
  )
}
/**
 * Sanitizes text and shortens it to at most `maxLength` UTF-16 code units.
 *
 * Sanitization runs before truncation, so the limit applies to the text with
 * placeholders already substituted. When the text has to be shortened and
 * `maxLength` is at least 3, the result ends in "..." and the ellipsis counts
 * toward the limit. For smaller limits the text is cut hard without an
 * ellipsis; zero or negative limits yield an empty string.
 *
 * @param text - Text to process; null, undefined and "" all yield "".
 * @param maxLength - Maximum length of the returned string.
 * @returns The sanitized, possibly shortened text.
 */
export function truncateAndSanitize(
  text: string | null | undefined,
  maxLength: number
): string {
  if (!text) return ''

  const sanitized = sanitizeText(text)
  if (sanitized.length <= maxLength) {
    return sanitized
  }

  // Only append the ellipsis when it fits inside the budget.
  const ellipsis = maxLength >= 3 ? '...' : ''

  // Clamp at 0: a negative end index would make slice() count from the END of
  // the string and return far more than maxLength characters.
  let cut = Math.max(0, maxLength - ellipsis.length)

  // Do not leave half of a surrogate pair (e.g. a split emoji) at the cut.
  const lastKept = sanitized.charCodeAt(cut - 1)
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    cut -= 1
  }

  return sanitized.slice(0, cut) + ellipsis
}
// ─── GDPR Compliance Validation ──────────────────────────────────────────────
/**
* Outcome of validateNoPersonalData.
*/
export interface PIIValidationResult {
// true exactly when `violations` is empty
valid: boolean
// One human-readable message per detected pattern type or sensitive field
violations: string[]
}
/**
 * Scans a record for anything that looks like personal data.
 * Used as the GDPR gate before data is sent to an AI service.
 *
 * Two independent checks run:
 * 1. The record is serialised with JSON.stringify and every PII_PATTERNS
 *    regex is tested against the resulting text, so keys and nested values
 *    are covered.
 * 2. The record's top-level key names are compared, case-insensitively,
 *    against a list of sensitive field names.
 *
 * @param data - Record to inspect.
 * @returns `valid` is true only when `violations` is empty. Each violation
 *   names the pattern type or field, never the offending value.
 */
export function validateNoPersonalData(
  data: Record<string, unknown>
): PIIValidationResult {
  const violations: string[] = []
  const textContent = JSON.stringify(data)

  for (const [type, pattern] of Object.entries(PII_PATTERNS)) {
    // The patterns are shared global regexes: test() resumes from lastIndex,
    // so rewind before each use to avoid skipping the start of the text.
    pattern.lastIndex = 0
    if (pattern.test(textContent)) {
      violations.push(`Potential ${type} detected in data`)
    }
  }

  const sensitiveFields = [
    'email',
    'phone',
    'password',
    'ssn',
    'socialSecurity',
    'creditCard',
    'bankAccount',
    'drivingLicense',
  ]

  // Lower-case BOTH sides. Comparing lower-cased keys against the camelCase
  // entries above would never match "socialSecurity", "creditCard", etc.
  const presentKeys = new Set(Object.keys(data).map((key) => key.toLowerCase()))
  for (const field of sensitiveFields) {
    if (presentKeys.has(field.toLowerCase())) {
      violations.push(`Sensitive field "${field}" present in data`)
    }
  }

  return {
    valid: violations.length === 0,
    violations,
  }
}
/**
 * Hard gate to run before an AI call: aborts as soon as one item fails
 * validateNoPersonalData.
 *
 * Only non-null values of type "object" are inspected; primitives and null
 * entries are skipped. On the first failing item the violations are logged
 * via console.error and an Error is thrown, so later items are not checked.
 *
 * @param data - Items about to be sent to an AI service.
 * @throws Error listing the violations of the first item that fails.
 */
export function enforceGDPRCompliance(data: unknown[]): void {
  data.forEach((item, position) => {
    const isInspectable = typeof item === 'object' && item !== null
    if (!isInspectable) return

    const report = validateNoPersonalData(item as Record<string, unknown>)
    if (report.valid) return

    console.error(
      `[GDPR] PII validation failed for item ${position}:`,
      report.violations
    )
    throw new Error(
      `GDPR compliance check failed: ${report.violations.join(', ')}`
    )
  })
}
/**
 * Checks the output of anonymizeForAI for leftover PII patterns.
 *
 * Inspected strings: every juror expertise tag, and every project's title,
 * description and tags. Empty, null or undefined strings count as clean.
 *
 * @param data - Anonymization result to verify.
 * @returns true when no inspected string matches any PII_PATTERNS regex.
 */
export function validateAnonymization(data: AnonymizationResult): boolean {
  const isClean = (text: string | null | undefined): boolean => {
    if (!text) return true
    return Object.values(PII_PATTERNS).every((pattern) => {
      // Shared global regex: rewind before testing.
      pattern.lastIndex = 0
      return !pattern.test(text)
    })
  }

  const jurorsClean = data.jurors.every((juror) =>
    juror.expertiseTags.every((tag) => isClean(tag))
  )
  if (!jurorsClean) return false

  return data.projects.every(
    (project) =>
      isClean(project.title) &&
      isClean(project.description) &&
      project.tags.every((tag) => isClean(tag))
  )
}
/**
 * Checks projects produced by anonymizeProjectForAI for leftover PII patterns.
 *
 * Inspected strings per project: title, description, institution and every
 * tag. Empty, null or undefined strings count as clean.
 *
 * @param projects - Anonymized projects to verify.
 * @returns true when no inspected string matches any PII_PATTERNS regex.
 */
export function validateAnonymizedProjects(
  projects: AnonymizedProjectForAI[]
): boolean {
  const containsPII = (text: string | null | undefined): boolean => {
    if (!text) return false
    return Object.values(PII_PATTERNS).some((pattern) => {
      // Shared global regex: rewind before testing.
      pattern.lastIndex = 0
      return pattern.test(text)
    })
  }

  const anyLeak = projects.some((project) => {
    const inspected = [
      project.title,
      project.description,
      project.institution,
      ...project.tags,
    ]
    return inspected.some((text) => containsPII(text))
  })

  return !anyLeak
}