# Dataset-level metadata: identity, SPARQL endpoint, named graphs, versioning,
# licensing and access limits.
schema_info:
  title: 'PubTator Central RDF'
  # Folded scalar: the wrapped lines below are joined with single spaces, so
  # the parsed value is one paragraph followed by a single newline.
  description: >
    PubTator Central provides biomedical entity annotations extracted from
    PubMed literature using advanced text mining and manual curation. The
    database contains Disease and Gene annotations linked to their
    corresponding PubMed articles, using identifiers from MeSH and NCBI Gene
    databases. Each annotation indicates how many times an entity appears in
    an article, enabling literature-based biomedical discovery and knowledge
    graph construction. Annotations are sourced from both PubTator3
    (automated) and ClinVar (curated). The RDF representation uses the Web
    Annotation Ontology to model entity-article relationships.
  endpoint: 'https://rdfportal.org/ncbi/sparql'
  base_uri: 'http://purl.jp/bio/10/pubtator-central/'
  # Named graphs referenced by the example queries in this file.
  graphs:
    - 'http://rdfportal.org/dataset/pubtator_central'
    - 'http://rdfportal.org/dataset/pubmed'
  version:
    # Kept as quoted strings so parsers do not retype them as float / date.
    mie_version: '1.1'
    mie_created: '2025-12-08'
    data_version: 'PubTator Central 2024'
    update_frequency: 'Regularly updated'
  license:
    data_license: 'Public Domain'
    license_url: 'https://www.ncbi.nlm.nih.gov/home/about/policies/'
  access:
    rate_limiting: 'Reasonable use policy'
    max_query_timeout: '60 seconds'
# ShEx shapes for the two annotation kinds described in this file.
# Both shapes share the same structure and differ only in the dcterms:subject
# literal. Every prefix used inside the shapes (including xsd:) is declared so
# the schema can be parsed on its own.
shape_expressions: |
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
  PREFIX oa: <http://www.w3.org/ns/oa#>
  PREFIX dcterms: <http://purl.org/dc/terms/>
  PREFIX pubtator: <http://purl.jp/bio/10/pubtator-central/ontology#>
  <DiseaseAnnotationShape> {
    a [ oa:Annotation ] ;
    dcterms:subject [ "Disease" ] ;
    oa:hasBody IRI ;
    oa:hasTarget IRI ;
    pubtator:annotation_count xsd:integer ;
    dcterms:source xsd:string ?
  }
  <GeneAnnotationShape> {
    a [ oa:Annotation ] ;
    dcterms:subject [ "Gene" ] ;
    oa:hasBody IRI ;
    oa:hasTarget IRI ;
    pubtator:annotation_count xsd:integer ;
    dcterms:source xsd:string ?
  }
# Representative Turtle snippets, one per entry. Each snippet declares its own
# prefixes so it can be read in isolation.
sample_rdf_entries:
  # NOTE(review): confirm the MeSH label given for D056486 against the MeSH
  # browser; the label in the description was not verified here.
  - title: "Disease Annotation Example"
    description: "Annotation linking the MeSH disease 'Fecal Incontinence' (D056486) to PubMed article 18935170, appearing once in that article."
    rdf: |
      @prefix oa: <http://www.w3.org/ns/oa#> .
      @prefix dcterms: <http://purl.org/dc/terms/> .
      @prefix pubtator: <http://purl.jp/bio/10/pubtator-central/ontology#> .
      <http://purl.jp/bio/10/pubtator-central/Disease/20000000>
        a oa:Annotation ;
        dcterms:subject "Disease" ;
        oa:hasBody <http://identifiers.org/mesh/D056486> ;
        oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/18935170> ;
        pubtator:annotation_count 1 .
  - title: "Gene Annotation Example"
    description: "Annotation linking NCBI Gene 11820 to PubMed article 16821116, showing gene mentions in scientific literature."
    rdf: |
      @prefix oa: <http://www.w3.org/ns/oa#> .
      @prefix dcterms: <http://purl.org/dc/terms/> .
      @prefix pubtator: <http://purl.jp/bio/10/pubtator-central/ontology#> .
      <http://purl.jp/bio/10/pubtator-central/Gene/5452014>
        a oa:Annotation ;
        dcterms:subject "Gene" ;
        oa:hasBody <http://identifiers.org/ncbigene/11820> ;
        oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/16821116> ;
        pubtator:annotation_count 1 .
  - title: "High-Frequency Gene Annotation"
    description: "Gene annotation with high annotation_count showing NCBI Gene 28964 appears multiple times in an article."
    rdf: |
      @prefix oa: <http://www.w3.org/ns/oa#> .
      @prefix dcterms: <http://purl.org/dc/terms/> .
      @prefix pubtator: <http://purl.jp/bio/10/pubtator-central/ontology#> .
      <http://purl.jp/bio/10/pubtator-central/Gene/3895582>
        a oa:Annotation ;
        dcterms:subject "Gene" ;
        oa:hasBody <http://identifiers.org/ncbigene/28964> ;
        oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/15383276> ;
        pubtator:annotation_count 9 .
  - title: "Disease with Standard Identifier"
    description: "Disease annotation using identifiers.org namespace, linking to MeSH term D001724 (Birth Weight)."
    rdf: |
      @prefix oa: <http://www.w3.org/ns/oa#> .
      @prefix dcterms: <http://purl.org/dc/terms/> .
      @prefix pubtator: <http://purl.jp/bio/10/pubtator-central/ontology#> .
      <http://purl.jp/bio/10/pubtator-central/Disease/20000001>
        a oa:Annotation ;
        dcterms:subject "Disease" ;
        oa:hasBody <http://identifiers.org/mesh/D001724> ;
        oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/18935173> ;
        pubtator:annotation_count 1 .
  # This entry uses a blank node subject ([]) rather than a named annotation
  # IRI; it illustrates the optional dcterms:source property only.
  - title: "Disease Annotation with Source Attribution"
    description: "Annotation with dcterms:source indicating data provenance from either PubTator3 (automated) or ClinVar (curated)."
    rdf: |
      @prefix oa: <http://www.w3.org/ns/oa#> .
      @prefix dcterms: <http://purl.org/dc/terms/> .
      @prefix pubtator: <http://purl.jp/bio/10/pubtator-central/ontology#> .
      []
        a oa:Annotation ;
        dcterms:subject "Disease" ;
        dcterms:source "PubTator3" ;
        oa:hasBody <http://identifiers.org/mesh/D003920> ;
        oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/18935173> ;
        pubtator:annotation_count 2 .
# Example queries, ordered from basic to advanced. Conventions used throughout:
#   ?ann         the oa:Annotation node
#   ?...Id       the oa:hasBody IRI (MeSH or NCBI Gene identifier)
#   dcterms:     the only prefix used for http://purl.org/dc/terms/
sparql_query_examples:
  - title: "Find all diseases mentioned in a specific PubMed article"
    description: "Retrieves all disease annotations for a given PubMed ID"
    question: "What diseases are mentioned in PubMed article 18935173?"
    complexity: basic
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT ?ann ?diseaseId
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann a oa:Annotation ;
             dcterms:subject "Disease" ;
             oa:hasBody ?diseaseId ;
             oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/18935173> .
      }
  - title: "Find genes annotated in literature"
    description: "Lists gene identifiers that appear in PubTator annotations"
    question: "What genes are annotated in the database?"
    complexity: basic
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT DISTINCT ?geneId
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann a oa:Annotation ;
             dcterms:subject "Gene" ;
             oa:hasBody ?geneId .
      }
      LIMIT 100
  - title: "Find all annotations for a specific disease"
    description: "Retrieves all PubMed articles that mention a specific disease by MeSH ID (D003920 = Diabetes Mellitus)"
    question: "Which papers mention diabetes mellitus (MeSH:D003920)?"
    complexity: intermediate
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT ?ann ?target
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann a oa:Annotation ;
             dcterms:subject "Disease" ;
             oa:hasBody <http://identifiers.org/mesh/D003920> ;
             oa:hasTarget ?target .
      }
      LIMIT 100
  - title: "Find gene-disease co-mentions in articles"
    description: "Identifies PubMed articles where both a gene and disease are annotated together"
    question: "Which papers mention both genes and diseases?"
    complexity: intermediate
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT DISTINCT ?article ?geneId ?diseaseId
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?geneAnn a oa:Annotation ;
                 dcterms:subject "Gene" ;
                 oa:hasBody ?geneId ;
                 oa:hasTarget ?article .
        ?diseaseAnn a oa:Annotation ;
                    dcterms:subject "Disease" ;
                    oa:hasBody ?diseaseId ;
                    oa:hasTarget ?article .
      }
      LIMIT 100
  - title: "Find annotations with multiple mentions"
    description: "Retrieves disease annotations where the entity is mentioned multiple times in an article"
    question: "Which diseases have multiple mentions in articles?"
    complexity: intermediate
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      PREFIX pubtator: <http://purl.jp/bio/10/pubtator-central/ontology#>
      SELECT ?ann ?diseaseId ?target ?count
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann a oa:Annotation ;
             dcterms:subject "Disease" ;
             oa:hasBody ?diseaseId ;
             oa:hasTarget ?target ;
             pubtator:annotation_count ?count .
        FILTER(?count > 1)
      }
      LIMIT 100
  # Cross-graph query: joins annotations with the PubMed graph. The
  # bibo:pmid triple is kept from the original query; ?pmid is not projected.
  - title: "Search disease annotations by article keyword"
    description: "Integrates PubTator with PubMed to find disease annotations in articles matching keyword search"
    question: "Find disease annotations in cancer-related articles"
    complexity: advanced
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      PREFIX bibo: <http://purl.org/ontology/bibo/>
      SELECT ?ann ?diseaseId ?article ?title
      FROM <http://rdfportal.org/dataset/pubtator_central>
      FROM <http://rdfportal.org/dataset/pubmed>
      WHERE {
        ?ann a oa:Annotation ;
             dcterms:subject "Disease" ;
             oa:hasBody ?diseaseId ;
             oa:hasTarget ?article .
        ?article bibo:pmid ?pmid ;
                 dcterms:title ?title .
        ?title bif:contains "'cancer'" .
      }
      LIMIT 100
  - title: "Find genes frequently associated with a specific disease"
    description: "Complex query identifying genes co-occurring with Alzheimer Disease (D000544) across articles"
    question: "What genes are commonly mentioned with Alzheimer Disease?"
    complexity: advanced
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      PREFIX mesh: <http://identifiers.org/mesh/>
      SELECT ?geneId (COUNT(DISTINCT ?article) AS ?cooccurrence)
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?geneAnn a oa:Annotation ;
                 dcterms:subject "Gene" ;
                 oa:hasBody ?geneId ;
                 oa:hasTarget ?article .
        ?diseaseAnn a oa:Annotation ;
                    dcterms:subject "Disease" ;
                    oa:hasBody mesh:D000544 ;
                    oa:hasTarget ?article .
      }
      GROUP BY ?geneId
      ORDER BY DESC(?cooccurrence)
      LIMIT 50
# Outbound links from annotations to external resources. Each entry names the
# linking property, lists the target databases by category, and gives a query
# that surfaces the links.
cross_references:
  - pattern: 'oa:hasBody with identifiers.org'
    # Folded scalar: parses to a single paragraph plus one trailing newline.
    description: >
      PubTator Central uses identifiers.org URIs to reference external
      databases. Disease annotations link to MeSH terms, while gene
      annotations link to NCBI Gene IDs. This standardized approach enables
      seamless integration with other biomedical resources.
    databases:
      diseases:
        - 'MeSH: Complete coverage of disease concepts via http://identifiers.org/mesh/'
      genes:
        - 'NCBI Gene: Gene identifiers via http://identifiers.org/ncbigene/'
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT ?ann ?entityType ?externalId
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
      ?ann a oa:Annotation ;
      dcterms:subject ?entityType ;
      oa:hasBody ?externalId .
      }
      LIMIT 100
  - pattern: 'oa:hasTarget to PubMed'
    description: >
      All annotations link to PubMed articles using NCBI's RDF URIs
      (http://rdf.ncbi.nlm.nih.gov/pubmed/). This enables queries across
      literature and facilitates citation network analysis.
    databases:
      literature:
        - 'PubMed: All annotations target PubMed articles'
    sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      SELECT DISTINCT ?pubmedArticle
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
      ?ann oa:hasTarget ?pubmedArticle .
      FILTER(STRSTARTS(STR(?pubmedArticle), "http://rdf.ncbi.nlm.nih.gov/pubmed/"))
      }
      LIMIT 100
# Free-text design notes grouped by concern. Each list item is an independent
# statement; list order carries no meaning.
architectural_notes:
  # How the graph is modelled.
  schema_design:
    - 'Uses Web Annotation Ontology (oa:Annotation) for modeling entity-article relationships'
    - 'Simple star schema with annotations as central nodes connecting entities to articles'
    - 'dcterms:subject provides entity type classification (Disease, Gene)'
    - 'annotation_count property tracks mention frequency within individual articles'
  # Query-cost guidance.
  performance:
    - 'Large queries may timeout; use LIMIT and OFFSET for pagination'
    - 'Disease annotations are heavily represented in the dataset'
    - 'Gene queries generally perform well with specific entity filters'
    - 'Full-text search with bif:contains requires integration with PubMed graph'
  # How this graph connects to other resources.
  data_integration:
    - 'identifiers.org namespace enables direct linking to MeSH and NCBI Gene'
    - 'PubMed URIs follow NCBI RDF conventions for seamless cross-database queries'
    - 'ClinVar and PubTator3 source attribution available via dcterms:source'
    - 'Can be joined with PubMed graph for title/abstract keyword searches'
  # Known gaps and caveats in the data.
  data_quality:
    - 'Some annotations lack dcterms:source (provenance not always recorded)'
    - 'Annotation counts represent frequency within individual articles, not across corpus'
    - 'Database primarily contains Disease and Gene entity types'
    - 'Other entity types (Chemical, Species, Mutation) may have limited or no coverage'
# Approximate, qualitative statistics. All values are descriptive strings
# rather than measured numbers.
data_statistics:
  total_annotations: '>10 million (estimated)'
  # Relative share of each entity type.
  entity_types:
    disease_annotations: 'Majority'
    gene_annotations: 'Substantial coverage'
  # Breadth of external identifier coverage.
  coverage:
    mesh_disease_coverage: 'Extensive MeSH disease term coverage'
    ncbi_gene_coverage: 'Broad gene identifier coverage'
  cardinality:
    avg_annotations_per_article: 'Variable, typically 1-5'
    annotation_count_range: 'Typically 1-2, occasionally up to 9+ for highly mentioned entities'
  # Rough latency expectations per query category.
  performance_characteristics:
    - 'Simple lookups by annotation ID: Fast (<1s)'
    - 'Entity-specific queries with filters: Moderate (1-5s)'
    - 'Large aggregations without LIMIT: May timeout'
    - 'Cross-graph queries with PubMed: Moderate (2-10s)'
  data_quality_notes:
    - 'Provenance (dcterms:source) not available for all annotations'
    - 'Gene annotations appear less frequently than disease annotations'
    - 'Data primarily focused on Disease and Gene entity types'
# Query anti-patterns. Each entry pairs a problematic query (wrong_sparql)
# with a corrected one (correct_sparql) and a one-line explanation.
anti_patterns:
  - title: "Querying without LIMIT causes timeouts"
    problem: "Large result sets without pagination will time out"
    wrong_sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT ?ann ?body
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann dcterms:subject "Disease" ;
             oa:hasBody ?body .
      }
    correct_sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT ?ann ?body
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann dcterms:subject "Disease" ;
             oa:hasBody ?body .
      }
      LIMIT 100
    explanation: "Always use LIMIT for exploratory queries to prevent timeouts on large datasets"
  - title: "Using bif:contains on URI fields"
    problem: "bif:contains doesn't work on URI fields like oa:hasBody; requires text fields"
    wrong_sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      SELECT ?ann ?body
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann oa:hasBody ?body .
        ?body bif:contains "'cancer'" .
      }
      LIMIT 100
    # A single dcterms: prefix is used for http://purl.org/dc/terms/, matching
    # the other queries in this file.
    correct_sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      PREFIX bibo: <http://purl.org/ontology/bibo/>
      SELECT ?ann ?diseaseId ?title
      FROM <http://rdfportal.org/dataset/pubtator_central>
      FROM <http://rdfportal.org/dataset/pubmed>
      WHERE {
        ?ann dcterms:subject "Disease" ;
             oa:hasBody ?diseaseId ;
             oa:hasTarget ?article .
        ?article dcterms:title ?title .
        ?title bif:contains "'cancer'" .
      }
      LIMIT 100
    explanation: "Use bif:contains on text fields; integrate with PubMed graph to search article titles/abstracts"
  - title: "Forgetting to specify entity type"
    problem: "Queries without dcterms:subject filter return mixed entity types"
    wrong_sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      SELECT ?ann ?body
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann oa:hasBody ?body ;
             oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/18935173> .
      }
    correct_sparql: |
      PREFIX oa: <http://www.w3.org/ns/oa#>
      PREFIX dcterms: <http://purl.org/dc/terms/>
      SELECT ?ann ?body
      FROM <http://rdfportal.org/dataset/pubtator_central>
      WHERE {
        ?ann dcterms:subject "Disease" ;
             oa:hasBody ?body ;
             oa:hasTarget <http://rdf.ncbi.nlm.nih.gov/pubmed/18935173> .
      }
    explanation: "Always filter by dcterms:subject to get specific entity types (Disease, Gene)"
# Frequently hit errors with causes, remedies and a before/after snippet.
# The example_fix snippets are fragments: they omit PREFIX and FROM clauses
# and hold two queries each, so they are not runnable as pasted.
common_errors:
  - error: "Query timeout on aggregation queries"
    causes:
      - "Attempting to aggregate over entire dataset without LIMIT"
      - "Complex joins without selective filters"
      - "Not using indexes effectively"
    solutions:
      - "Add LIMIT clause to restrict result size"
      - "Use specific entity filters (dcterms:subject, oa:hasBody)"
      - "Break large queries into smaller batches using OFFSET"
    # NOTE(review): in SPARQL, LIMIT is applied after GROUP BY, so it caps the
    # rows returned rather than the triples aggregated. Confirm that the
    # "After" query really avoids the timeout on this endpoint.
    example_fix: |
      # Before: Times out
      SELECT ?diseaseId (COUNT(*) as ?count)
      WHERE { ?ann oa:hasBody ?diseaseId . }
      GROUP BY ?diseaseId
      # After: Works efficiently
      SELECT ?diseaseId (COUNT(*) as ?count)
      WHERE {
        ?ann dcterms:subject "Disease" ;
             oa:hasBody ?diseaseId .
      }
      GROUP BY ?diseaseId
      LIMIT 100
  - error: "Empty results when querying for specific entity types"
    causes:
      - "Incorrect assumption about available entity types"
      - "Using wrong dcterms:subject values"
      - "Case sensitivity in literal matching"
    solutions:
      - "Use 'Disease' and 'Gene' as confirmed entity types"
      - "Check exact string matching with literals"
      - "Query without filters first to explore available types"
    example_fix: |
      # Before: Returns nothing
      SELECT ?ann WHERE {
        ?ann dcterms:subject "chemical" .
      }
      # After: Use confirmed types
      SELECT ?ann WHERE {
        ?ann dcterms:subject "Disease" .
      }
      LIMIT 10
  - error: "Not handling optional dcterms:source properly"
    causes:
      - "Assuming all annotations have source attribution"
      - "Using non-optional pattern matching"
    solutions:
      - "Use OPTIONAL for dcterms:source"
      - "Filter only when source is required"
      # SPARQL has no NULL: a variable left unmatched by OPTIONAL is unbound.
      - "Check for unbound values in results (e.g. FILTER(!BOUND(?source)))"
    example_fix: |
      # Before: Misses annotations without source
      SELECT ?ann ?source WHERE {
        ?ann a oa:Annotation ;
             dcterms:source ?source .
      }
      # After: Captures all annotations
      SELECT ?ann ?source WHERE {
        ?ann a oa:Annotation .
        OPTIONAL { ?ann dcterms:source ?source . }
      }
      LIMIT 100