/**
 * Data Anonymization Service
 *
 * Strips PII (names, emails, etc.) from data before sending to AI services.
 * Returns ID mappings for de-anonymization of results.
 *
 * GDPR Compliance:
 * - All personal identifiers are stripped before AI processing
 * - Project/user IDs are replaced with sequential anonymous IDs
 * - Text content is sanitized to remove emails, phones, URLs
 * - Validation ensures no PII leakage before each AI call
 */
|
|
|
|
import type {
  CompetitionCategory,
  OceanIssue,
  FileType,
  SubmissionSource,
} from '@prisma/client'
|
|
|
|
// ─── Description Limits ──────────────────────────────────────────────────────
|
|
|
|
/**
 * Maximum description length passed to `truncateAndSanitize` for each AI
 * calling context. Keys are the valid `DescriptionContext` values.
 */
export const DESCRIPTION_LIMITS = {
  ASSIGNMENT: 300,
  FILTERING: 500,
  ELIGIBILITY: 400,
  MENTOR: 350,
} as const

/** Name of a context that has an entry in `DESCRIPTION_LIMITS`. */
export type DescriptionContext = keyof typeof DESCRIPTION_LIMITS
|
|
|
|
// ─── PII Patterns ────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Regular expressions used both to redact text and to validate that outgoing
 * data is free of PII-looking content.
 *
 * Every pattern carries the global (`g`) flag, so `RegExp.prototype.test` is
 * stateful through `lastIndex`; callers that use `.test()` must reset
 * `lastIndex` to 0 first. `String.prototype.replace` resets it on its own.
 */
const PII_PATTERNS = {
  // Local part, "@", domain and a TLD of at least two letters.
  email: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
  // Optional country code, then 3-3-4 digits with optional "-", "." or space separators.
  phone: /(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g,
  // http(s) scheme followed by everything up to the next whitespace.
  url: /https?:\/\/[^\s]+/g,
  // 3-2-4 digit groups separated by hyphens.
  ssn: /\d{3}-\d{2}-\d{4}/g,
  // Four dot-separated groups of 1-3 digits (octet ranges are not validated).
  ipv4: /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
} as const
|
|
|
|
// ─── Basic Anonymization Types (Assignment Service) ──────────────────────────
|
|
|
|
/**
 * Juror record as sent to the AI by the assignment service.
 * Carries no name or email; the juror is referenced only by `anonymousId`.
 */
export interface AnonymizedJuror {
  /** Sequential placeholder ID; resolved back to the real ID via `JurorMapping`. */
  anonymousId: string
  /** Expertise tags copied from the source juror. */
  expertiseTags: string[]
  /** Number of assignments the juror currently holds. */
  currentAssignmentCount: number
  /** Assignment cap for the juror, or `null` when none is set. */
  maxAssignments: number | null
}
|
|
|
|
/**
 * Project record as sent to the AI by the assignment service.
 * The project is referenced only by `anonymousId`.
 */
export interface AnonymizedProject {
  /** Sequential placeholder ID; resolved back to the real ID via `ProjectMapping`. */
  anonymousId: string
  /** Project title after text sanitization. */
  title: string
  /** Sanitized, length-limited description, or `null` when the project has none. */
  description: string | null
  /** Tags copied from the source project. */
  tags: string[]
  /** Placeholder team label (never the real team name), or `null` when the project has no team name. */
  teamName: string | null
}
|
|
|
|
/** Links a juror's anonymous ID to the real juror ID for de-anonymization. */
export interface JurorMapping {
  /** Placeholder ID that was sent to the AI. */
  anonymousId: string
  /** ID of the original juror record. */
  realId: string
}
|
|
|
|
/** Links a project's anonymous ID to the real project ID for de-anonymization. */
export interface ProjectMapping {
  /** Placeholder ID that was sent to the AI. */
  anonymousId: string
  /** ID of the original project record. */
  realId: string
}
|
|
|
|
/**
 * Output of `anonymizeForAI`: the anonymized records to send to the AI plus
 * the mappings needed to translate its answers back to real IDs.
 */
export interface AnonymizationResult {
  /** Anonymized jurors, in input order. */
  jurors: AnonymizedJuror[]
  /** Anonymized projects, in input order. */
  projects: AnonymizedProject[]
  /** One mapping per juror; must stay server-side. */
  jurorMappings: JurorMapping[]
  /** One mapping per project; must stay server-side. */
  projectMappings: ProjectMapping[]
}
|
|
|
|
// ─── Enhanced Project Types (Filtering/Awards) ───────────────────────────────
|
|
|
|
/**
 * Comprehensive anonymized project data for AI filtering.
 * Includes all fields needed for flexible filtering criteria.
 * Field names are snake_case because this object is what the AI sees.
 */
export interface AnonymizedProjectForAI {
  project_id: string // P1, P2, etc.
  title: string // Sanitized
  description: string // Truncated + PII stripped; '' when absent
  category: CompetitionCategory | null // STARTUP | BUSINESS_CONCEPT
  ocean_issue: OceanIssue | null // Enum value
  country: string | null
  region: string | null // geographicZone
  institution: string | null
  tags: string[]
  founded_year: number | null // Just the year
  team_size: number
  has_description: boolean
  file_count: number
  file_types: string[] // FileType values
  wants_mentorship: boolean
  submission_source: SubmissionSource
  submitted_date: string | null // YYYY-MM-DD only
}
|
|
|
|
/**
 * Project input with all relations needed for comprehensive anonymization.
 * This is the shape consumed by `anonymizeProjectForAI`; only `id`, `title`,
 * `tags` and `submissionSource` are required.
 */
export interface ProjectWithRelations {
  /** Real project ID; only ever used to build the de-anonymization mapping. */
  id: string
  title: string
  description?: string | null
  /** Declared for callers' convenience; `anonymizeProjectForAI` does not read it. */
  teamName?: string | null
  competitionCategory?: CompetitionCategory | null
  oceanIssue?: OceanIssue | null
  country?: string | null
  geographicZone?: string | null
  institution?: string | null
  tags: string[]
  foundedAt?: Date | null
  wantsMentorship?: boolean
  submissionSource: SubmissionSource
  submittedAt?: Date | null
  /** Relation counts; a missing count is treated as 0 during anonymization. */
  _count?: {
    teamMembers?: number
    files?: number
  }
  /** Attached files; only the file type is needed. */
  files?: Array<{ fileType: FileType | null }>
}
|
|
|
|
/**
 * Mapping for de-anonymization of filtering/award results: links the `P<n>`
 * identifier sent to the AI to the real project ID.
 */
export interface ProjectAIMapping {
  /** Placeholder ID that was sent to the AI. */
  anonymousId: string
  /** ID of the original project record. */
  realId: string
}
|
|
|
|
// ─── Project Conversion Helper ──────────────────────────────────────────────
|
|
|
|
/**
 * Convert a loosely-typed Prisma project result to ProjectWithRelations.
 * Used by ai-tagging, ai-filtering, and ai-award-eligibility services.
 *
 * Enum-like string fields are narrowed with type assertions only; no runtime
 * validation of the values is performed.
 *
 * Defaults applied:
 * - `wantsMentorship`: `false` when null/undefined
 * - `submissionSource`: `'MANUAL'` when null/undefined
 * - `_count.teamMembers`: `0` when missing
 * - `_count.files`: falls back to `files.length`, then `0`
 * - `files`: `[]` when missing; a missing `fileType` becomes `null`
 *
 * @param project - Project row, optionally with `_count` and `files` relations.
 * @returns A new object; the input is not mutated.
 */
export function toProjectWithRelations(project: {
  id: string
  title: string
  description?: string | null
  competitionCategory?: string | null
  oceanIssue?: string | null
  country?: string | null
  geographicZone?: string | null
  institution?: string | null
  tags: string[]
  foundedAt?: Date | null
  wantsMentorship?: boolean | null
  submissionSource?: string
  submittedAt?: Date | null
  _count?: { teamMembers?: number; files?: number }
  files?: Array<{ fileType?: string | null; [key: string]: unknown }>
}): ProjectWithRelations {
  return {
    id: project.id,
    title: project.title,
    description: project.description,
    competitionCategory: project.competitionCategory as ProjectWithRelations['competitionCategory'],
    oceanIssue: project.oceanIssue as ProjectWithRelations['oceanIssue'],
    country: project.country,
    geographicZone: project.geographicZone,
    institution: project.institution,
    tags: project.tags,
    foundedAt: project.foundedAt,
    wantsMentorship: project.wantsMentorship ?? false,
    submissionSource: (project.submissionSource as ProjectWithRelations['submissionSource']) ?? 'MANUAL',
    submittedAt: project.submittedAt,
    _count: {
      teamMembers: project._count?.teamMembers ?? 0,
      // Prefer the database count; otherwise count the loaded file relation.
      files: project._count?.files ?? project.files?.length ?? 0,
    },
    // Drop every file property except the type.
    files: project.files?.map((f) => ({ fileType: (f.fileType as FileType) ?? null })) ?? [],
  }
}
|
|
|
|
// ─── Basic Anonymization (Assignment Service) ────────────────────────────────
|
|
|
|
/**
 * Juror shape accepted by `anonymizeForAI`.
 * `name` and `email` are accepted because callers pass full user rows, but
 * `anonymizeForAI` never copies them into its output.
 */
interface JurorInput {
  /** Real juror ID; only stored in the de-anonymization mapping. */
  id: string
  name?: string | null
  email: string
  expertiseTags: string[]
  maxAssignments?: number | null
  /** Relation count; a missing value is treated as 0 assignments. */
  _count?: {
    assignments: number
  }
}
|
|
|
|
/** Project shape accepted by `anonymizeForAI`. */
interface ProjectInput {
  /** Real project ID; only stored in the de-anonymization mapping. */
  id: string
  title: string
  description?: string | null
  tags: string[]
  /** Real team name; replaced by a placeholder label during anonymization. */
  teamName?: string | null
}
|
|
|
|
/**
 * Anonymize juror and project data for AI processing (Assignment service).
 *
 * - Jurors get IDs `juror_001`, `juror_002`, … and projects get `project_001`,
 *   `project_002`, … in input order. Names and emails are never copied.
 * - Project titles are sanitized; descriptions are sanitized and truncated to
 *   `DESCRIPTION_LIMITS.ASSIGNMENT`, or `null` when absent/empty.
 * - A real team name is replaced by the placeholder `Team <n>`.
 * - Tags are copied as-is (not sanitized); `validateAnonymization` checks
 *   them for PII patterns afterwards.
 *
 * @param jurors - Jurors to anonymize.
 * @param projects - Projects to anonymize.
 * @returns Anonymized records plus the mappings required by `deanonymizeResults`.
 */
export function anonymizeForAI(
  jurors: JurorInput[],
  projects: ProjectInput[]
): AnonymizationResult {
  // Zero-padded, 1-based sequence number shared by both ID schemes.
  const sequence = (index: number): string =>
    (index + 1).toString().padStart(3, '0')

  const jurorMappings: JurorMapping[] = []
  const anonymizedJurors: AnonymizedJuror[] = []
  jurors.forEach((juror, index) => {
    const anonymousId = `juror_${sequence(index)}`

    jurorMappings.push({ anonymousId, realId: juror.id })
    anonymizedJurors.push({
      anonymousId,
      // Copy so the AI payload never aliases the caller's array.
      expertiseTags: [...juror.expertiseTags],
      currentAssignmentCount: juror._count?.assignments ?? 0,
      maxAssignments: juror.maxAssignments ?? null,
    })
  })

  const projectMappings: ProjectMapping[] = []
  const anonymizedProjects: AnonymizedProject[] = []
  projects.forEach((project, index) => {
    const anonymousId = `project_${sequence(index)}`

    projectMappings.push({ anonymousId, realId: project.id })
    anonymizedProjects.push({
      anonymousId,
      title: sanitizeText(project.title),
      description: project.description
        ? truncateAndSanitize(project.description, DESCRIPTION_LIMITS.ASSIGNMENT)
        : null,
      tags: [...project.tags],
      // Only reveal that a team name exists, never the name itself.
      teamName: project.teamName ? `Team ${index + 1}` : null,
    })
  })

  return {
    jurors: anonymizedJurors,
    projects: anonymizedProjects,
    jurorMappings,
    projectMappings,
  }
}
|
|
|
|
// ─── Enhanced Anonymization (Filtering/Awards) ───────────────────────────────
|
|
|
|
/**
 * Anonymize a single project with comprehensive data for AI filtering.
 *
 * GDPR Compliance:
 * - Strips team names, email references, phone numbers, URLs
 * - Replaces IDs with sequential anonymous IDs
 * - Truncates descriptions to limit data exposure
 * - Keeps only necessary fields for filtering criteria
 *
 * Both date-derived fields are computed in UTC so the result does not depend
 * on the server's local time zone. An invalid `Date` yields `null` instead of
 * `NaN` or a thrown `RangeError`.
 *
 * @param project - Project with the relations needed for filtering.
 * @param index - Zero-based position of the project; the anonymous ID is `P<index + 1>`.
 * @param context - Selects the description length limit from `DESCRIPTION_LIMITS`.
 * @returns The anonymized project. The real ID is not part of the result.
 */
export function anonymizeProjectForAI(
  project: ProjectWithRelations,
  index: number,
  context: DescriptionContext = 'FILTERING'
): AnonymizedProjectForAI {
  const descriptionLimit = DESCRIPTION_LIMITS[context]

  // getUTCFullYear() returns NaN for an invalid Date; treat that as "unknown".
  const foundedYear = project.foundedAt?.getUTCFullYear()
  const hasFoundedYear = foundedYear !== undefined && !Number.isNaN(foundedYear)

  // toISOString() throws on an invalid Date, so check validity first.
  const submittedAt = project.submittedAt
  const submittedDate =
    submittedAt && !Number.isNaN(submittedAt.getTime())
      ? submittedAt.toISOString().split('T')[0] ?? null
      : null

  const fileTypes = (project.files ?? [])
    .map((file) => file.fileType)
    .filter((fileType): fileType is FileType => fileType != null)

  return {
    project_id: `P${index + 1}`,
    title: sanitizeText(project.title),
    description: truncateAndSanitize(project.description, descriptionLimit),
    category: project.competitionCategory ?? null,
    ocean_issue: project.oceanIssue ?? null,
    country: project.country ?? null,
    region: project.geographicZone ?? null,
    institution: project.institution ?? null,
    tags: project.tags,
    founded_year: hasFoundedYear ? foundedYear : null,
    team_size: project._count?.teamMembers ?? 0,
    // Whitespace-only descriptions count as missing.
    has_description: !!project.description?.trim(),
    file_count: project._count?.files ?? 0,
    file_types: fileTypes,
    wants_mentorship: project.wantsMentorship ?? false,
    submission_source: project.submissionSource,
    submitted_date: submittedDate,
  }
}
|
|
|
|
/**
 * Anonymize multiple projects and return mappings.
 *
 * Each mapping's `anonymousId` is taken from the anonymized record itself, so
 * the two can never drift apart if the ID format changes.
 *
 * @param projects - Projects to anonymize; output order matches input order.
 * @param context - Selects the description length limit from `DESCRIPTION_LIMITS`.
 * @returns `anonymized` (safe to send to the AI) and `mappings` (keep
 *   server-side; pass to `deanonymizeProjectResults`).
 */
export function anonymizeProjectsForAI(
  projects: ProjectWithRelations[],
  context: DescriptionContext = 'FILTERING'
): {
  anonymized: AnonymizedProjectForAI[]
  mappings: ProjectAIMapping[]
} {
  const anonymized: AnonymizedProjectForAI[] = []
  const mappings: ProjectAIMapping[] = []

  projects.forEach((project, index) => {
    const anonymizedProject = anonymizeProjectForAI(project, index, context)

    anonymized.push(anonymizedProject)
    mappings.push({
      anonymousId: anonymizedProject.project_id,
      realId: project.id,
    })
  })

  return { anonymized, mappings }
}
|
|
|
|
// ─── De-anonymization ────────────────────────────────────────────────────────
|
|
|
|
/**
 * De-anonymize AI results back to real IDs.
 *
 * Each result is copied and extended with `realJurorId` and `realProjectId`.
 * When an ID returned by the AI has no mapping (e.g. the model invented it),
 * the anonymous ID is passed through unchanged, so callers should verify the
 * resolved IDs before using them as database keys.
 *
 * @param results - AI results keyed by anonymous juror/project IDs.
 * @param jurorMappings - Mappings produced alongside the anonymized jurors.
 * @param projectMappings - Mappings produced alongside the anonymized projects.
 * @returns New result objects in input order; the inputs are not mutated.
 */
export function deanonymizeResults<
  T extends { jurorId: string; projectId: string }
>(
  results: T[],
  jurorMappings: JurorMapping[],
  projectMappings: ProjectMapping[]
): (T & { realJurorId: string; realProjectId: string })[] {
  const jurorMap = new Map(
    jurorMappings.map((m): [string, string] => [m.anonymousId, m.realId])
  )
  const projectMap = new Map(
    projectMappings.map((m): [string, string] => [m.anonymousId, m.realId])
  )

  return results.map((result) => ({
    ...result,
    // `??` rather than `||`: fall back only when the mapping is missing.
    realJurorId: jurorMap.get(result.jurorId) ?? result.jurorId,
    realProjectId: projectMap.get(result.projectId) ?? result.projectId,
  }))
}
|
|
|
|
/**
 * De-anonymize project-only results (for filtering/awards).
 *
 * Each result is copied and extended with `realProjectId`. When a
 * `project_id` has no mapping, the anonymous ID is passed through unchanged,
 * so callers should verify the resolved ID before using it as a database key.
 *
 * @param results - AI results keyed by anonymous `project_id`.
 * @param mappings - Mappings returned by `anonymizeProjectsForAI`.
 * @returns New result objects in input order; the inputs are not mutated.
 */
export function deanonymizeProjectResults<T extends { project_id: string }>(
  results: T[],
  mappings: ProjectAIMapping[]
): (T & { realProjectId: string })[] {
  const projectMap = new Map(
    mappings.map((m): [string, string] => [m.anonymousId, m.realId])
  )

  return results.map((result) => ({
    ...result,
    // `??` rather than `||`: fall back only when the mapping is missing.
    realProjectId: projectMap.get(result.project_id) ?? result.project_id,
  }))
}
|
|
|
|
// ─── Text Sanitization ───────────────────────────────────────────────────────
|
|
|
|
/**
 * Sanitize text to remove potential PII patterns.
 * Replaces URLs, emails, phone numbers, SSN-like numbers and IPv4 addresses
 * with fixed placeholders.
 *
 * Order matters: URLs are replaced first. The placeholders contain a space,
 * so if an email or phone number inside a URL were replaced first, the URL
 * pattern (which stops at whitespace) would match only up to that space and
 * leave the rest of the URL in the output.
 *
 * IPv4 addresses are stripped as well because the validation helpers in this
 * module reject any text that still matches `PII_PATTERNS.ipv4`.
 *
 * @param text - Raw text.
 * @returns The text with every pattern match replaced by its placeholder.
 */
export function sanitizeText(text: string): string {
  // String.prototype.replace resets lastIndex on global regexes, so the
  // shared patterns can be used here without manual state handling.
  return text
    .replace(PII_PATTERNS.url, '[url removed]')
    .replace(PII_PATTERNS.email, '[email removed]')
    .replace(PII_PATTERNS.phone, '[phone removed]')
    .replace(PII_PATTERNS.ssn, '[id removed]')
    .replace(PII_PATTERNS.ipv4, '[ip removed]')
}
|
|
|
|
/**
 * Truncate text to a maximum length and sanitize.
 *
 * Sanitization runs before truncation so a PII match can never be cut in
 * half and slip past the patterns. Text that is too long is cut and suffixed
 * with `...`; the result never exceeds `maxLength`.
 *
 * @param text - Raw text; `null`, `undefined` and `''` all yield `''`.
 * @param maxLength - Maximum length of the returned string, ellipsis included.
 *   When it is smaller than the ellipsis, the text is hard-cut without one.
 * @returns The sanitized text, truncated if necessary.
 */
export function truncateAndSanitize(
  text: string | null | undefined,
  maxLength: number
): string {
  if (!text) return ''

  const sanitized = sanitizeText(text)
  if (sanitized.length <= maxLength) {
    return sanitized
  }

  const ellipsis = '...'

  // No room for the ellipsis: a negative slice end would count from the end
  // of the string and return almost the whole text, so hard-cut instead.
  if (maxLength < ellipsis.length) {
    return sanitized.slice(0, Math.max(0, maxLength))
  }

  let cut = maxLength - ellipsis.length

  // Do not end on a lone high surrogate (half of an emoji or astral character).
  const lastUnit = sanitized.charCodeAt(cut - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut -= 1
  }

  return sanitized.slice(0, cut) + ellipsis
}
|
|
|
|
// ─── GDPR Compliance Validation ──────────────────────────────────────────────
|
|
|
|
/** Outcome of a PII check performed by `validateNoPersonalData`. */
export interface PIIValidationResult {
  /** `true` when no violations were found. */
  valid: boolean
  /** Human-readable description of each finding; empty when `valid` is `true`. */
  violations: string[]
}
|
|
|
|
/**
 * Validate that data contains no personal information.
 * Used for GDPR compliance before sending data to AI.
 *
 * Two checks are performed:
 * 1. The JSON serialization of `data` (nested values included) is tested
 *    against every pattern in `PII_PATTERNS`.
 * 2. The top-level keys of `data` are compared, case-insensitively, against a
 *    list of sensitive field names. Nested keys are not inspected by this check.
 *
 * @param data - Object about to be sent to an AI service.
 * @returns `valid: true` with no violations, or `valid: false` with one
 *   message per finding. Messages name the kind of finding, never the value.
 */
export function validateNoPersonalData(
  data: Record<string, unknown>
): PIIValidationResult {
  const violations: string[] = []
  const textContent = JSON.stringify(data)

  // Check each PII pattern
  for (const [type, pattern] of Object.entries(PII_PATTERNS)) {
    // Global regexes remember lastIndex between test() calls; start from 0.
    pattern.lastIndex = 0

    if (pattern.test(textContent)) {
      violations.push(`Potential ${type} detected in data`)
    }

    // Leave the shared pattern in a clean state for the next user.
    pattern.lastIndex = 0
  }

  // Additional checks for common PII fields
  const sensitiveFields = [
    'email',
    'phone',
    'password',
    'ssn',
    'socialSecurity',
    'creditCard',
    'bankAccount',
    'drivingLicense',
  ]

  // Lowercase both sides so camelCase names such as "creditCard" can match.
  const presentKeys = new Set(Object.keys(data).map((k) => k.toLowerCase()))
  for (const field of sensitiveFields) {
    if (presentKeys.has(field.toLowerCase())) {
      violations.push(`Sensitive field "${field}" present in data`)
    }
  }

  return {
    valid: violations.length === 0,
    violations,
  }
}
|
|
|
|
/**
 * Enforce GDPR compliance before EVERY AI call.
 * Throws an error if PII is detected.
 *
 * Objects (arrays included) are validated as-is. Bare strings are wrapped in
 * a single-property object so they go through the same pattern checks
 * instead of being skipped. Other primitives (numbers, booleans, `null`,
 * `undefined`) are not checked.
 *
 * @param data - Items about to be sent to an AI service.
 * @throws Error on the first item that fails validation. The message lists
 *   the kinds of violation; the offending values are never logged or thrown.
 */
export function enforceGDPRCompliance(data: unknown[]): void {
  for (let i = 0; i < data.length; i++) {
    const item = data[i]

    let record: Record<string, unknown>
    if (typeof item === 'object' && item !== null) {
      record = item as Record<string, unknown>
    } else if (typeof item === 'string') {
      // The wrapper key is neutral: it is not in the sensitive-field list.
      record = { value: item }
    } else {
      continue
    }

    const { valid, violations } = validateNoPersonalData(record)
    if (valid) continue

    console.error(`[GDPR] PII validation failed for item ${i}:`, violations)
    throw new Error(`GDPR compliance check failed: ${violations.join(', ')}`)
  }
}
|
|
|
|
/**
 * Validate that data has been properly anonymized.
 * Returns true if no PII patterns are detected.
 *
 * Checked text: every juror expertise tag, and every project's title,
 * description, team name and tags. Only the anonymized records are inspected;
 * the ID mappings are not, since they hold real IDs by design.
 *
 * @param data - Result of the assignment-service anonymization.
 * @returns `false` as soon as any checked text matches a pattern in `PII_PATTERNS`.
 */
export function validateAnonymization(data: AnonymizationResult): boolean {
  // Empty or missing text is trivially clean.
  const isClean = (text: string | null | undefined): boolean => {
    if (!text) return true
    return Object.values(PII_PATTERNS).every((pattern) => {
      // Global regexes remember lastIndex between test() calls; start from 0.
      pattern.lastIndex = 0
      return !pattern.test(text)
    })
  }

  const jurorsClean = data.jurors.every((juror) =>
    juror.expertiseTags.every((tag) => isClean(tag))
  )
  if (!jurorsClean) return false

  return data.projects.every(
    (project) =>
      isClean(project.title) &&
      isClean(project.description) &&
      isClean(project.teamName) &&
      project.tags.every((tag) => isClean(tag))
  )
}
|
|
|
|
/**
 * Validate anonymized projects for AI (enhanced version).
 *
 * Checks each project's title, description, institution and tags against
 * every pattern in `PII_PATTERNS`. Other fields are not inspected.
 *
 * @param projects - Anonymized projects about to be sent to the AI.
 * @returns `true` when none of the checked fields matches a PII pattern.
 */
export function validateAnonymizedProjects(
  projects: AnonymizedProjectForAI[]
): boolean {
  // Empty or missing text is trivially clean.
  const isClean = (text: string | null | undefined): boolean => {
    if (!text) return true
    return Object.values(PII_PATTERNS).every((pattern) => {
      // Global regexes remember lastIndex between test() calls; start from 0.
      pattern.lastIndex = 0
      return !pattern.test(text)
    })
  }

  return projects.every(
    (project) =>
      isClean(project.title) &&
      isClean(project.description) &&
      isClean(project.institution) &&
      project.tags.every((tag) => isClean(tag))
  )
}
|