Adding model card for Zentropi (#200) · roost.tools/coop@e7c7b1a

+1

.gitignore

··· 44 44 45 45 # NCMEC report XML files (contain sensitive data) 46 46 ncmec-reports/ 47 + .devops/backups/

-6

client/src/graphql/generated.ts

··· 1307 1307 }; 1308 1308 1309 1309 export const GQLIntegration = { 1310 - Akismet: 'AKISMET', 1311 1310 GoogleContentSafetyApi: 'GOOGLE_CONTENT_SAFETY_API', 1312 - L1Ght: 'L1GHT', 1313 - MicrosoftAzureContentModerator: 'MICROSOFT_AZURE_CONTENT_MODERATOR', 1314 - Oopspam: 'OOPSPAM', 1315 1311 OpenAi: 'OPEN_AI', 1316 - SightEngine: 'SIGHT_ENGINE', 1317 - TwoHat: 'TWO_HAT', 1318 1312 Zentropi: 'ZENTROPI', 1319 1313 } as const; 1320 1314

+19

package-lock.json

··· 830 830 } 831 831 } 832 832 }, 833 + "node_modules/@graphql-codegen/cli/node_modules/@types/node": { 834 + "version": "25.5.2", 835 + "resolved": "https://registry.npmjs.org/@types/node/-/node-25.5.2.tgz", 836 + "integrity": "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg==", 837 + "license": "MIT", 838 + "optional": true, 839 + "peer": true, 840 + "dependencies": { 841 + "undici-types": "~7.18.0" 842 + } 843 + }, 833 844 "node_modules/@graphql-codegen/cli/node_modules/cosmiconfig": { 834 845 "version": "9.0.0", 835 846 "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", ··· 5260 5271 "engines": { 5261 5272 "node": ">=0.10.0" 5262 5273 } 5274 + }, 5275 + "node_modules/undici-types": { 5276 + "version": "7.18.2", 5277 + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", 5278 + "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", 5279 + "license": "MIT", 5280 + "optional": true, 5281 + "peer": true 5263 5282 }, 5264 5283 "node_modules/universalify": { 5265 5284 "version": "2.0.0",

-6

server/graphql/generated.ts

··· 1376 1376 }; 1377 1377 1378 1378 export const GQLIntegration = { 1379 - Akismet: 'AKISMET', 1380 1379 GoogleContentSafetyApi: 'GOOGLE_CONTENT_SAFETY_API', 1381 - L1Ght: 'L1GHT', 1382 - MicrosoftAzureContentModerator: 'MICROSOFT_AZURE_CONTENT_MODERATOR', 1383 - Oopspam: 'OOPSPAM', 1384 1380 OpenAi: 'OPEN_AI', 1385 - SightEngine: 'SIGHT_ENGINE', 1386 - TwoHat: 'TWO_HAT', 1387 1381 Zentropi: 'ZENTROPI', 1388 1382 } as const; 1389 1383

-6

server/graphql/modules/integration.ts

··· 18 18 19 19 const typeDefs = /* GraphQL */ ` 20 20 enum Integration { 21 - AKISMET 22 21 GOOGLE_CONTENT_SAFETY_API 23 - L1GHT 24 - MICROSOFT_AZURE_CONTENT_MODERATOR 25 - OOPSPAM 26 22 OPEN_AI 27 - SIGHT_ENGINE 28 - TWO_HAT 29 23 ZENTROPI 30 24 } 31 25

+25 -8

server/services/integrationRegistry/integrationManifests.ts

··· 158 158 159 159 const ZENTROPI: IntegrationManifestEntry = { 160 160 modelCard: { 161 - modelName: 'Zentropi', 161 + modelName: 'Zentropi: CoPE-A-9B', 162 162 version: '1.x', 163 - releaseDate: 'Ongoing', 163 + releaseDate: 'July 20, 2025', 164 164 sections: [ 165 165 { 166 166 id: 'trainingData', 167 167 title: 'Training Data Sources', 168 - fields: [{ label: 'Data Sources', value: 'TBD' }], 168 + fields: [{ label: 'Data Sources', value: "CoPE-A's dataset includes ~60,000 labels across unique policy/content pairs using policy texts created by CoPE team and content from publicly-accessible internet forums. The CoPE team used a mix of automated and manual annotation to create golden labels. The training data includes but is not limited to hate speech, sexual content, self-harm, harassment, toxicity." }], 169 169 }, 170 170 { 171 171 id: 'policyAndTaxonomy', 172 172 title: 'Policy & Taxonomy Definitions', 173 - fields: [{ label: 'Policies', value: 'TBD' }], 173 + fields: [{ label: 'Policies', value: "No fixed taxonomy. CoPE-A is policy-adaptive and steerable by users who define custom criteria for their specific use case. Trained for generalizable policy understanding across diverse policy formulations." }], 174 174 }, 175 175 { 176 176 id: 'annotationMethodology', 177 177 title: 'Annotation Methodology', 178 - fields: [{ label: 'Methodology', value: 'TBD' }], 178 + fields: [{ label: 'Methodology', value: 'CoPE-A was trained using a novel training methodology that moves beyond policy memorization to achieve true policy interpretation. Trained across conflicting policy formulations with focus on generalizable policy understanding and interpretation consistency. Combined automated and manual labeling processes for quality assurance.' }], 179 179 }, 180 180 { 181 181 id: 'performanceBenchmarks', 182 182 title: 'Performance Benchmarks', 183 - fields: [{ label: 'Benchmarks', value: 'TBD' }], 183 + fields: [{ label: 'Benchmarks', value: 'Tested on policies and content never seen during training. High accuracy across all content types: Hate Speech 91% accurate (internal test), 84% accurate (public Ethos test); Inappropriate Sexual Content 89%; Toxic Speech 90%; Self-Harm 88%; Harassment 73%. Outperforms comparable models including GPT-4o, Llama-3.1-8B, LlamaGuard3-8B, and ShieldGemma-9B across most categories.' }], 184 184 }, 185 185 { 186 186 id: 'biasAndLimitations', 187 187 title: 'Bias Documentation & Known Limits', 188 - fields: [{ label: 'Known Limitations', value: 'TBD' }], 188 + fields: [{ label: 'Known Limitations', value: 'Text processing is limited to 8K tokens. Optimized for US English only; performance degrades for other languages/locales. Binary classification only (label present/absent). Cannot classify content requiring external verification unless explicitly defined in policy. Requires careful policy design to mitigate potential biases. Users should monitor classification patterns across demographic groups and audit decisions regularly.' }], 189 189 }, 190 190 { 191 191 id: 'implementationGuidance', ··· 194 194 { 195 195 label: 'Credentials', 196 196 value: 197 - 'API Key plus optional Labeler Versions (id and label per version).', 197 + 'API Key plus optional Labeler Versions (id and label per version) created by the user.', 198 + }, 199 + { 200 + label: 'Input Format', 201 + value: 202 + 'Use Zentropi to create and test labeling policies and classifiers. The input format should include: (1) Overview of policy subject, (2) Definition of Terms (precise definitions of words/phrases), (3) Interpretation of Language (guidance on ambiguous language), (4) Definition of Labels with Includes/Excludes criteria. CoPE-A then returns binary classification (0/1) indicating if content matches any policy labels.', 198 203 }, 199 204 ], 200 205 }, ··· 202 207 id: 'relevantLinks', 203 208 title: 'Relevant Links', 204 209 fields: [ 210 + { 211 + label: 'HuggingFace Model Card', 212 + value: 'https://huggingface.co/zentropi-ai/cope-a-9b', 213 + }, 205 214 { 206 215 label: 'Documentation', 207 216 value: 'https://docs.zentropi.ai', 217 + }, 218 + { 219 + label: 'Research Talk', 220 + value: 'https://www.youtube.com/live/JMq49FZ5qmY?si=Q6qpHNeTo-Bc6t9a&t=1', 221 + }, 222 + { 223 + label: 'Sample Code Notebook', 224 + value: 'https://colab.research.google.com/drive/1LBmQ3d0OVrq2EpVP0tc03POalf3sDpjl?usp=sharing', 208 225 }, 209 226 ], 210 227 },

Configure Feed

Configure Feed