{"schemaVersion":"hugging-bay.catalog-sources.v1","generatedAt":"2026-07-01T14:14:45.890Z","policy":"Sources are metadata-first unless mirrorPolicy and source-specific review explicitly allow file hosting.","totalSources":14,"rows":[{"id":"huggingface","name":"Hugging Face Hub","status":"ready-planned","artifactTypes":["model","dataset","space","collection","paper"],"categories":["models","datasets","spaces","collections","papers"],"ingestMode":"api-paginated","metadataUse":"primary","metadataLicense":"source-metadata-with-attribution","termsUrl":"https://huggingface.co/terms-of-service","apiDocs":["https://huggingface.co/docs/hub/en/api","https://huggingface.co/docs/huggingface_hub/en/guides/search","https://huggingface.co/docs/hub/en/rate-limits"],"formats":["json","repo-card","siblings"],"rateLimitPolicy":"Use HF_TOKEN when available, follow Link cursors, honor 429 and RateLimit headers, retry transient 5xx with jitter.","licenseExtractor":"cardData.license and license:* tags, normalized to policy allowlists.","provenanceFields":["repoType","repoId","repoUrl","revision","sha","fetchedAt","licenseRaw","gated","private","disabled"],"dedupeKeys":["huggingface:{repoType}:{repoId}","sourceUrl"],"skipRules":["gated","private","disabled","missing-metadata","unknown-license-for-public-index"],"refreshCadence":"daily for top packs, weekly for long tail","mirrorAllowed":true,"mirrorPolicy":"selective-reviewed-only","notes":"Best first source for broad metadata. Keep files external until a mirror review approves specific paths."},{"id":"github","name":"GitHub public repositories","status":"planned","artifactTypes":["app","tool","agent","library"],"categories":["apps","tools","agents","libraries","model-serving","evals"],"ingestMode":"api-search","metadataUse":"candidate","metadataLicense":"GitHub API metadata with source attribution","termsUrl":"https://docs.github.com/en/site-policy/github-terms/github-terms-of-service","apiDocs":["https://docs.github.com/en/rest/search/search","https://docs.github.com/en/rest/licenses/licenses"],"formats":["json","license","topics","releases"],"rateLimitPolicy":"Use authenticated API tokens, respect REST/GraphQL rate limits, and query by curated topics.","licenseExtractor":"license.spdx_id plus repository license file detection where available.","provenanceFields":["node_id","full_name","html_url","default_branch","pushed_at","license","topics"],"dedupeKeys":["github:{owner}/{repo}","githubNode:{node_id}","sourceUrl"],"skipRules":["archived-without-release","no-license","unclear-ai-relevance","personal-data-heavy"],"refreshCadence":"weekly","mirrorAllowed":false,"mirrorPolicy":"source-code-review-only","notes":"Useful for open AI tooling and apps. Do not infer model/data redistribution rights from code license."},{"id":"openml","name":"OpenML","status":"planned","artifactTypes":["dataset","benchmark","evaluation"],"categories":["datasets","tasks","evaluations"],"ingestMode":"api-paginated","metadataUse":"candidate","metadataLicense":"OpenML metadata attribution required; per-dataset license controls reuse","termsUrl":"https://www.openml.org/terms","apiDocs":["https://docs.openml.org/ecosystem/Rest/"],"formats":["json","xml","arff","croissant"],"rateLimitPolicy":"Use documented REST endpoints conservatively and cache dataset/task metadata.","licenseExtractor":"dataset licence field plus DOI/source metadata.","provenanceFields":["dataset_id","task_id","flow_id","doi","url","version","licence"],"dedupeKeys":["openml:{datasetId}","openmlTask:{taskId}","doi","sourceUrl"],"skipRules":["missing-license","sensitive-data","custom-unclear-license","restricted"],"refreshCadence":"monthly","mirrorAllowed":false,"mirrorPolicy":"dataset-license-review-only","notes":"Good for classic ML datasets/tasks/evals. Dataset bytes require per-record license and size review before hosting."},{"id":"zenodo","name":"Zenodo","status":"planned","artifactTypes":["dataset","software","paper","research-artifact"],"categories":["datasets","papers","research-artifacts"],"ingestMode":"api-search","metadataUse":"candidate","metadataLicense":"Zenodo metadata CC0 except personal email addresses","termsUrl":"https://about.zenodo.org/policies/","apiDocs":["https://developers.zenodo.org/"],"formats":["json","oai-pmh","datacite"],"rateLimitPolicy":"Use REST search and OAI-PMH harvests conservatively; strip personal email fields.","licenseExtractor":"record metadata license, access_right, DOI/DataCite fields.","provenanceFields":["recordId","conceptrecid","doi","conceptdoi","version","access_right","license"],"dedupeKeys":["zenodo:{recordId}","zenodoConcept:{conceptrecid}","doi","sourceUrl"],"skipRules":["restricted","embargoed","missing-license","personal-email-only-contact"],"refreshCadence":"monthly","mirrorAllowed":false,"mirrorPolicy":"license-and-size-review-only","notes":"Good for DOI-backed research artifacts and datasets. Mirror only clearly licensed manageable files."},{"id":"arxiv","name":"arXiv","status":"planned","artifactTypes":["paper","method","benchmark"],"categories":["papers","methods","benchmarks"],"ingestMode":"api-or-oai","metadataUse":"candidate","metadataLicense":"arXiv metadata reuse with attribution and API terms","termsUrl":"https://info.arxiv.org/help/api/tou.html","apiDocs":["https://info.arxiv.org/help/api/index.html"],"formats":["atom","oai-pmh"],"rateLimitPolicy":"Follow arXiv API politeness limits and prefer OAI-PMH for bulk metadata.","licenseExtractor":"paper license when available; otherwise metadata-only paper record.","provenanceFields":["arxivId","version","doi","published","updated","categories","primaryCategory"],"dedupeKeys":["arxiv:{arxivId}","arxivVersion:{versionedId}","doi"],"skipRules":["withdrawn","no-ai-relevance","fulltext-mirroring"],"refreshCadence":"weekly","mirrorAllowed":false,"mirrorPolicy":"metadata-and-links-only","notes":"Good for paper and method context. Do not treat papers as redistributable model artifacts."},{"id":"paperswithcode","name":"Papers with Code","status":"planned","artifactTypes":["paper","benchmark","task","code-link"],"categories":["papers","benchmarks","tasks","leaderboards","code"],"ingestMode":"archive-or-api","metadataUse":"candidate","metadataLicense":"CC-BY-SA data archive; preserve attribution/share-alike obligations","termsUrl":"https://github.com/paperswithcode/paperswithcode-data","apiDocs":["https://paperswithcode.com/api/v1/docs/"],"formats":["json"],"rateLimitPolicy":"Prefer data archive snapshots for reproducibility; verify API availability before live pulls.","licenseExtractor":"paper/code links only; artifact license comes from linked source.","provenanceFields":["paperId","arxivId","doi","repoUrl","task","dataset","metric"],"dedupeKeys":["paperswithcode:{paperId}","arxiv:{arxivId}","github:{owner}/{repo}"],"skipRules":["stale-without-source","missing-attribution","unclear-code-link"],"refreshCadence":"monthly","mirrorAllowed":false,"mirrorPolicy":"metadata-and-links-only","notes":"Useful for task/eval pages and linking papers to code. Treat as context, not artifact authority."},{"id":"softwareheritage","name":"Software Heritage","status":"planned-support","artifactTypes":["provenance","source-archive"],"categories":["source-provenance","dedupe","archives"],"ingestMode":"point-lookup","metadataUse":"enrichment","metadataLicense":"SWH API metadata per Software Heritage terms","termsUrl":"https://www.softwareheritage.org/legal/api-terms-of-use/","apiDocs":["https://archive.softwareheritage.org/api/"],"formats":["json","swhid"],"rateLimitPolicy":"Do not mass-extract via public API; use point lookups or approved bulk access.","licenseExtractor":"not a license source; enrich provenance only.","provenanceFields":["swhid","origin","snapshot","revision","content"],"dedupeKeys":["swhid:{swhid}","sourceUrl"],"skipRules":["mass-public-api-extraction","personal-data-enrichment"],"refreshCadence":"on-demand","mirrorAllowed":false,"mirrorPolicy":"provenance-enrichment-only","notes":"Useful for durable source identity and dedupe, not as a primary artifact catalog."},{"id":"dataverse","name":"Dataverse instances","status":"legal-review","artifactTypes":["dataset","research-artifact"],"categories":["academic-datasets","research-data"],"ingestMode":"oai-or-native-api","metadataUse":"candidate","metadataLicense":"varies by instance and record","termsUrl":"https://guides.dataverse.org/en/latest/admin/harvestserver.html","apiDocs":["https://guides.dataverse.org/en/latest/api/native-api.html"],"formats":["json","json-ld","oai-ore","oai-pmh","croissant"],"rateLimitPolicy":"Harvest only published metadata from approved instances and respect instance policies.","licenseExtractor":"dataset terms/license from native API, JSON-LD, or terms metadata.","provenanceFields":["persistentId","datasetId","versionId","doi","license","publisher"],"dedupeKeys":["doi","dataverse:{persistentId}","sourceUrl"],"skipRules":["missing-license","restricted","embargoed","instance-terms-unknown"],"refreshCadence":"monthly","mirrorAllowed":false,"mirrorPolicy":"dataset-license-review-only","notes":"Good academic dataset coverage after instance-level terms review."},{"id":"semanticscholar","name":"Semantic Scholar","status":"legal-review","artifactTypes":["paper","citation-context"],"categories":["papers","citations","authors"],"ingestMode":"api-enrichment","metadataUse":"enrichment","metadataLicense":"Semantic Scholar API/data license requires product/legal approval","termsUrl":"https://www.semanticscholar.org/product/api","apiDocs":["https://api.semanticscholar.org/api-docs/"],"formats":["json"],"rateLimitPolicy":"Use API key, cache results, and treat as enrichment rather than artifact authority.","licenseExtractor":"not an artifact license source.","provenanceFields":["paperId","corpusId","doi","arxivId","year","fieldsOfStudy"],"dedupeKeys":["semanticScholar:{paperId}","corpus:{corpusId}","doi","arxiv:{arxivId}"],"skipRules":["abstract-fulltext-restriction","commercial-use-unapproved","author-personal-data-enrichment"],"refreshCadence":"monthly","mirrorAllowed":false,"mirrorPolicy":"metadata-enrichment-only","notes":"Useful for citation and paper graph context after terms review."},{"id":"kaggle","name":"Kaggle","status":"legal-review","artifactTypes":["dataset","notebook","model"],"categories":["datasets","notebooks","models"],"ingestMode":"api-cli","metadataUse":"candidate","metadataLicense":"varies by dataset/model and Kaggle API terms","termsUrl":"https://www.kaggle.com/terms","apiDocs":["https://github.com/Kaggle/kaggle-api"],"formats":["json","data-package","croissant"],"rateLimitPolicy":"Requires account/API credentials; index only public records allowed by terms.","licenseExtractor":"dataset-metadata license and record page metadata.","provenanceFields":["ownerSlug","datasetSlug","version","url","licenseName"],"dedupeKeys":["kaggle:{owner}/{slug}","sourceUrl"],"skipRules":["competition-data","missing-license","health-face-biometric-risk","restricted"],"refreshCadence":"monthly","mirrorAllowed":false,"mirrorPolicy":"metadata-first-terms-review","notes":"High dataset coverage, but API credentials and terms matter. Start with metadata and upstream links only."},{"id":"openvino-model-zoo","name":"OpenVINO Open Model Zoo","status":"planned","artifactTypes":["model"],"categories":["vision","speech","edge-inference"],"ingestMode":"github-yaml","metadataUse":"candidate","metadataLicense":"repo Apache-2.0, model source licenses vary","termsUrl":"https://docs.openvino.ai/2024/documentation/legacy-features/model-zoo.html","apiDocs":["https://github.com/openvinotoolkit/open_model_zoo"],"formats":["yaml","github"],"rateLimitPolicy":"Use GitHub raw metadata sparingly and cache per commit.","licenseExtractor":"per-model model.yml and upstream source metadata.","provenanceFields":["modelName","modelYamlPath","sourceUrl","checksum","commit"],"dedupeKeys":["openvinoModel:{modelName}","sourceUrl","checksum"],"skipRules":["source-license-unknown","maintenance-mode-risk","missing-checksum"],"refreshCadence":"quarterly","mirrorAllowed":false,"mirrorPolicy":"per-model-license-review-only","notes":"Useful historical/edge model catalog. Verify each upstream source before mirroring."},{"id":"modelscope","name":"ModelScope","status":"legal-review","artifactTypes":["model","dataset","space"],"categories":["models","datasets","spaces"],"ingestMode":"sdk-or-api","metadataUse":"candidate","metadataLicense":"varies by repo and ModelScope terms","termsUrl":"https://www.modelscope.cn/docs","apiDocs":["https://www.modelscope.cn/docs"],"formats":["json","repo-card"],"rateLimitPolicy":"Use official SDK/API if approved; verify jurisdiction, token, and terms first.","licenseExtractor":"repo card/license metadata normalized to policy allowlists.","provenanceFields":["repoType","repoId","namespace","revision","licenseRaw","sourceUrl"],"dedupeKeys":["modelscope:{repoType}:{repoId}","sourceUrl"],"skipRules":["terms-unapproved","unknown-license","gated-or-restricted"],"refreshCadence":"monthly","mirrorAllowed":false,"mirrorPolicy":"metadata-first-terms-review","notes":"Useful second model hub after legal/product review."},{"id":"tensorflow-hub","name":"TensorFlow Hub","status":"legal-review","artifactTypes":["model"],"categories":["tensorflow","tflite","tfjs"],"ingestMode":"approved-metadata-access-only","metadataUse":"candidate","metadataLicense":"terms review required","termsUrl":"https://www.tensorflow.org/hub","apiDocs":["https://www.tensorflow.org/hub"],"formats":["html","savedmodel","tflite","tfjs"],"rateLimitPolicy":"Use only if stable metadata access and robots/terms are approved; do not scrape aggressively.","licenseExtractor":"model page/license metadata where available.","provenanceFields":["tfhubHandle","publisher","model","version","sourceUrl"],"dedupeKeys":["tfhub:{publisher}/{model}/{version}","sourceUrl"],"skipRules":["no-bulk-api","terms-unapproved","unknown-license"],"refreshCadence":"quarterly","mirrorAllowed":false,"mirrorPolicy":"metadata-first-terms-review","notes":"Conditional source only after terms and access review."},{"id":"onnx-model-zoo","name":"ONNX Model Zoo","status":"historical","artifactTypes":["model"],"categories":["onnx","historical-models"],"ingestMode":"github-metadata","metadataUse":"candidate","metadataLicense":"repo/license varies by model history","termsUrl":"https://github.com/onnx/models","apiDocs":["https://github.com/onnx/models"],"formats":["github","onnx"],"rateLimitPolicy":"Treat as historical metadata; do not promise upstream downloads.","licenseExtractor":"repo/model metadata and license file where present.","provenanceFields":["modelPath","commit","checksum","sourceUrl"],"dedupeKeys":["onnxModel:{modelPath}","sourceUrl","checksum"],"skipRules":["deprecated-download","missing-license","missing-checksum"],"refreshCadence":"quarterly","mirrorAllowed":false,"mirrorPolicy":"external-historical-metadata-only","notes":"Historical ONNX catalog. Keep external metadata only unless a reviewed modern source exists."}]}