Skip to main content
Glama
ncbigene.yaml18.3 kB
# Schema and usage description for the NCBI Gene RDF dataset served at the
# SPARQL endpoint named below: data shape, sample RDF, example queries,
# cross-references, design notes, statistics, anti-patterns and common errors.
#
# NOTE(review): this layout was reconstructed from a whitespace-collapsed copy.
# Nesting follows the key order of the original text — verify against upstream.
# The `bif:` prefix used in the text-search queries is not declared anywhere in
# this file; presumably the endpoint predefines it — confirm.
schema_info:
  title: NCBI Gene Database
  description: |
    Gene database with 57M+ entries covering protein-coding genes, ncRNAs,
    pseudogenes across all organisms. Includes gene symbols, descriptions,
    chromosomal locations, types, synonyms, and cross-references to Ensembl,
    HGNC, OMIM. Supports comparative genomics via orthology relationships
    and taxonomic classification.
  endpoint: https://rdfportal.org/ncbigene/sparql
  base_uri: http://identifiers.org/ncbigene/
  graphs:
    - http://rdfportal.org/dataset/ncbigene
  version:
    mie_version: "1.2"
    mie_created: "2025-01-10"
    data_version: "November 2024"
    update_frequency: "Regular"
  license:
    data_license: "NCBI Public Domain"
    license_url: https://www.ncbi.nlm.nih.gov/home/about/policies/
  access:
    rate_limiting: "Standard SPARQL limits"
    max_query_timeout: "60s"

# The shape uses rdfs: and xsd: prefixed names, so both prefixes are declared
# here alongside the dataset-specific ones.
shape_expressions: |
  PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>
  PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
  PREFIX dct: <http://purl.org/dc/terms/>
  PREFIX orth: <http://purl.org/net/orth#>
  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

  <GeneShape> {
    a [ insdc:Gene ] ;
    dct:identifier xsd:integer ;
    rdfs:label xsd:string ;  # Gene symbol (INS, BRCA1)
    dct:description xsd:string ? ;  # Full name (insulin)
    insdc:gene_synonym xsd:string * ;
    dct:alternative xsd:string * ;
    ncbio:typeOfGene xsd:string ;
    ncbio:nomenclatureStatus xsd:string ? ;
    ncbio:fullName xsd:string ? ;
    ncbio:taxid IRI ;
    insdc:chromosome xsd:string ? ;
    insdc:map xsd:string ? ;
    insdc:dblink IRI * ;
    insdc:db_xref xsd:string * ;
    orth:hasOrtholog IRI * ;
    dct:modified xsd:date
  }

sample_rdf_entries:
  - title: "Protein-Coding Gene (A1BG)"
    description: "Human gene with synonyms, location, cross-references."
    rdf: |
      @prefix insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/> .
      @prefix ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#> .
      @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
      @prefix dct: <http://purl.org/dc/terms/> .

      <http://identifiers.org/ncbigene/1> a insdc:Gene ;
        rdfs:label "A1BG" ;
        dct:description "alpha-1-B glycoprotein" ;
        insdc:gene_synonym "A1B", "ABG", "GAB" ;
        ncbio:typeOfGene "protein-coding" ;
        insdc:chromosome "19" ;
        insdc:map "19q13.43" ;
        insdc:dblink <http://identifiers.org/ensembl/ENSG00000121410>,
                     <http://identifiers.org/hgnc/5> ;
        ncbio:taxid <http://identifiers.org/taxonomy/9606> .

  - title: "Non-Coding RNA (LINC00165)"
    description: "Long intergenic ncRNA with alternative names."
    rdf: |
      @prefix insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/> .
      @prefix ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#> .
      @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
      @prefix dct: <http://purl.org/dc/terms/> .

      <http://identifiers.org/ncbigene/727701> a insdc:Gene ;
        rdfs:label "LINC00165" ;
        ncbio:typeOfGene "ncRNA" ;
        insdc:chromosome "21" ;
        dct:alternative "narcolepsy candidate-region 1 gene B" ;
        ncbio:taxid <http://identifiers.org/taxonomy/9606> .

  - title: "Gene with Orthologs (INS)"
    description: "Insulin gene with cross-species relationships."
    rdf: |
      @prefix insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/> .
      @prefix orth: <http://purl.org/net/orth#> .
      @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
      @prefix dct: <http://purl.org/dc/terms/> .

      <http://identifiers.org/ncbigene/3630> a insdc:Gene ;
        rdfs:label "INS" ;
        dct:description "insulin" ;
        orth:hasOrtholog <http://identifiers.org/ncbigene/16334>,
                         <http://identifiers.org/ncbigene/24506> .

  - title: "Pseudogene (ACTG1P14)"
    description: "Non-functional gene copy."
    rdf: |
      @prefix insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/> .
      @prefix ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#> .
      @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

      <http://identifiers.org/ncbigene/100418910> a insdc:Gene ;
        rdfs:label "ACTG1P14" ;
        ncbio:typeOfGene "pseudo" ;
        insdc:chromosome "9" .

  - title: "Multi-Chromosome Gene (SHOX)"
    description: "Gene on X and Y with multiple OMIM links."
    rdf: |
      @prefix insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/> .
      @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

      <http://identifiers.org/ncbigene/6473> a insdc:Gene ;
        rdfs:label "SHOX" ;
        insdc:chromosome "X|Y" ;
        insdc:dblink <http://identifiers.org/mim/312865>,
                     <http://identifiers.org/mim/400020> .

sparql_query_examples:
  - title: "Basic Gene Info"
    description: "Get gene metadata by ID"
    question: "What is gene 1?"
    complexity: basic
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX dct: <http://purl.org/dc/terms/>

      SELECT ?label ?description ?type
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        <http://identifiers.org/ncbigene/1> rdfs:label ?label ;
          ncbio:typeOfGene ?type .
        OPTIONAL { <http://identifiers.org/ncbigene/1> dct:description ?description }
      }

  - title: "List Genes by Type"
    description: "Filter genes by biological type"
    question: "Find protein-coding genes in human"
    complexity: basic
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

      SELECT ?gene ?label
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene a insdc:Gene ;
          rdfs:label ?label ;
          ncbio:typeOfGene "protein-coding" ;
          ncbio:taxid <http://identifiers.org/taxonomy/9606> .
      }
      LIMIT 10

  # A LIMIT is included so this example follows the file's own guidance
  # ("Add LIMIT (10-1000)") like the other search examples.
  - title: "Search Gene Symbols"
    description: "Full-text search on gene symbols with relevance scoring"
    question: "Find INS, BRCA1, or TP53"
    complexity: intermediate
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

      SELECT ?gene ?label ?sc
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene a insdc:Gene ;
          rdfs:label ?label ;
          ncbio:taxid <http://identifiers.org/taxonomy/9606> .
        ?label bif:contains "'INS' OR 'BRCA1' OR 'TP53'" option (score ?sc) .
      }
      ORDER BY DESC(?sc)
      LIMIT 20

  - title: "Search Full Gene Names"
    description: "Keyword search in gene descriptions with biological context"
    question: "Find insulin-related genes"
    complexity: intermediate
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX dct: <http://purl.org/dc/terms/>

      SELECT ?gene ?label ?description ?sc
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene a insdc:Gene ;
          rdfs:label ?label ;
          dct:description ?description ;
          ncbio:taxid <http://identifiers.org/taxonomy/9606> .
        ?description bif:contains "'insulin'" option (score ?sc) .
      }
      ORDER BY DESC(?sc)
      LIMIT 20

  - title: "Boolean Keyword Search"
    description: "Complex text search with AND/OR/NOT for biological annotations"
    question: "Cancer genes but not suppressors"
    complexity: intermediate
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX dct: <http://purl.org/dc/terms/>

      SELECT ?gene ?label ?description ?sc
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene a insdc:Gene ;
          rdfs:label ?label ;
          dct:description ?description ;
          ncbio:typeOfGene "protein-coding" ;
          ncbio:taxid <http://identifiers.org/taxonomy/9606> .
        ?description bif:contains "('cancer' OR 'tumor') AND NOT 'suppressor'" option (score ?sc) .
      }
      ORDER BY DESC(?sc)
      LIMIT 20

  - title: "Orthology Analysis"
    description: "Find orthologous genes across species"
    question: "What are INS orthologs?"
    complexity: advanced
    sparql: |
      PREFIX orth: <http://purl.org/net/orth#>
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

      SELECT ?ortholog ?label ?taxid
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        <http://identifiers.org/ncbigene/3630> orth:hasOrtholog ?ortholog .
        ?ortholog rdfs:label ?label ;
          ncbio:taxid ?taxid .
      }
      LIMIT 100

  # ?dblink is IRI-valued per the shape above, so it is passed through STR()
  # before GROUP_CONCAT, which is specified over string values.
  - title: "Comprehensive Gene Annotation"
    description: "Aggregate all metadata with biological context"
    question: "Get complete gene profile"
    complexity: advanced
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX orth: <http://purl.org/net/orth#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX dct: <http://purl.org/dc/terms/>

      SELECT ?label ?description ?type ?chromosome ?map
        (GROUP_CONCAT(DISTINCT ?synonym; separator="; ") as ?synonyms)
        (GROUP_CONCAT(DISTINCT STR(?dblink); separator="; ") as ?links)
        (COUNT(DISTINCT ?ortholog) as ?ortho_count)
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        <http://identifiers.org/ncbigene/3630> rdfs:label ?label ;
          ncbio:typeOfGene ?type .
        OPTIONAL { <http://identifiers.org/ncbigene/3630> dct:description ?description }
        OPTIONAL { <http://identifiers.org/ncbigene/3630> insdc:chromosome ?chromosome }
        OPTIONAL { <http://identifiers.org/ncbigene/3630> insdc:map ?map }
        OPTIONAL { <http://identifiers.org/ncbigene/3630> insdc:gene_synonym ?synonym }
        OPTIONAL { <http://identifiers.org/ncbigene/3630> insdc:dblink ?dblink }
        OPTIONAL { <http://identifiers.org/ncbigene/3630> orth:hasOrtholog ?ortholog }
      }
      GROUP BY ?label ?description ?type ?chromosome ?map

cross_references:
  - pattern: insdc:dblink (IRI links)
    description: |
      Primary external links via identifiers.org URIs to Ensembl, HGNC, OMIM.
    databases:
      Genomics:
        - Ensembl: gene annotations
        - HGNC: human nomenclature
        - OMIM: disease associations
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>

      SELECT ?gene ?external
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene insdc:dblink ?external .
        FILTER(STRSTARTS(STR(?external), "http://identifiers.org/ensembl/"))
      }
      LIMIT 10

  - pattern: insdc:db_xref (string refs)
    description: |
      String-based references like "Database:ID".
    databases:
      Various:
        - AllianceGenome: integrated resources
    sparql: |
      PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/nucleotide/>

      SELECT ?gene ?xref
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene insdc:db_xref ?xref .
        FILTER(CONTAINS(?xref, "AllianceGenome"))
      }
      LIMIT 10

  - pattern: orth:hasOrtholog
    description: |
      Bidirectional orthology for comparative genomics.
    databases:
      Internal:
        - NCBI Gene: within-database orthology
    sparql: |
      PREFIX orth: <http://purl.org/net/orth#>

      SELECT ?gene1 ?gene2
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene1 orth:hasOrtholog ?gene2 .
      }
      LIMIT 10

  - pattern: ncbio:taxid
    description: |
      Taxonomic classification via NCBI Taxonomy.
    databases:
      Taxonomy:
        - NCBI Taxonomy: organism classification
    sparql: |
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>

      SELECT ?gene ?taxid
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene ncbio:taxid ?taxid .
      }
      LIMIT 10

architectural_notes:
  schema_design:
    - Single Gene entity with comprehensive properties
    - identifiers.org URIs for cross-database consistency
    - Label=symbols (INS), description=full names (insulin)
    - IRI-based (dblink) vs string-based (db_xref) references
    - Bidirectional orthology enables comparative analysis
  performance:
    - Use bif:contains for text search (not REGEX/FILTER)
    - Filter by taxid early to reduce search space
    - Gene ID lookups fast (<1s)
    - Always add LIMIT to orthology queries
    - Include FROM clause for graph targeting
  data_integration:
    - identifiers.org enables seamless cross-database linking
    - HGNC for official nomenclature
    - Ensembl for detailed annotations
    - OMIM for disease associations
    - Taxonomy for phylogenetic context
  data_quality:
    - Official status via nomenclatureStatus
    - Synonyms capture historical names
    - Optional properties vary by curation level
    - Best coverage for model organisms

# NOTE(review): `performance` and `data_quality` below are placed under
# data_statistics because architectural_notes already defines keys with the
# same names — confirm against upstream.
data_statistics:
  total_genes: 57768578
  coverage:
    typeOfGene: "100%"
    description: "~95%"
    chromosomal_location: "~80%"
    external_links: "~70%"
    orthology: "~40%"
  cardinality:
    avg_synonyms: 2.5
    avg_links: 1.8
    avg_orthologs: 150
  gene_types:
    protein_coding: 46105590
    ncRNA: 3476823
    tRNA: 3030346
    pseudo: 2731789
    rRNA: 993602
  performance:
    - "Gene ID lookup: <1s"
    - "Type filtering: 1-5s"
    - "bif:contains search: 1-3s"
    - "Orthology queries: 10-30s"
  data_quality:
    - Best curated for human, mouse, rat
    - Coverage decreases for non-model organisms
    - Orthology most complete between close species

# Each entry pairs a deliberately problematic query (wrong_sparql, kept as
# written) with the recommended form (correct_sparql).
anti_patterns:
  - title: "Searching without organism filter"
    problem: "Timeout with 57M+ genes"
    wrong_sparql: |
      SELECT ?gene ?description
      WHERE {
        ?gene dct:description ?description .
        FILTER(CONTAINS(?description, "insulin"))
      }
    correct_sparql: |
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX dct: <http://purl.org/dc/terms/>

      SELECT ?gene ?description ?sc
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene ncbio:taxid <http://identifiers.org/taxonomy/9606> ;
          dct:description ?description .
        ?description bif:contains "'insulin'" option (score ?sc) .
      }
      ORDER BY DESC(?sc)
      LIMIT 100
    explanation: "Filter by taxid first; use bif:contains not FILTER"

  # correct_sparql carries a LIMIT so the recommended query follows the
  # file's own "Add LIMIT (10-1000)" guidance.
  - title: "Searching full names in label"
    problem: "No results when searching 'insulin' in label"
    wrong_sparql: |
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

      SELECT ?gene ?label
      WHERE {
        ?gene rdfs:label ?label .
        ?label bif:contains "'insulin'" option (score ?sc) .
      }
    correct_sparql: |
      PREFIX ncbio: <https://dbcls.github.io/ncbigene-rdf/ontology.ttl#>
      PREFIX dct: <http://purl.org/dc/terms/>

      SELECT ?gene ?description ?sc
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        ?gene dct:description ?description ;
          ncbio:taxid <http://identifiers.org/taxonomy/9606> .
        ?description bif:contains "'insulin'" option (score ?sc) .
      }
      ORDER BY DESC(?sc)
      LIMIT 100
    explanation: "Label=symbols (INS), description=full names (insulin)"

  - title: "Unbounded orthology queries"
    problem: "Millions of results cause timeout"
    wrong_sparql: |
      PREFIX orth: <http://purl.org/net/orth#>

      SELECT ?gene1 ?gene2
      WHERE {
        ?gene1 orth:hasOrtholog ?gene2 .
      }
    correct_sparql: |
      PREFIX orth: <http://purl.org/net/orth#>

      SELECT ?gene2
      FROM <http://rdfportal.org/dataset/ncbigene>
      WHERE {
        <http://identifiers.org/ncbigene/3630> orth:hasOrtholog ?gene2 .
      }
      LIMIT 100
    explanation: "Start with specific gene; always add LIMIT"

common_errors:
  - error: "Query timeout or no results"
    causes:
      - "No organism filter on 57M+ genes"
      - "FILTER/CONTAINS instead of bif:contains"
      - "Missing FROM clause"
      - "Unbounded orthology traversal"
    solutions:
      - "Filter by ncbio:taxid early"
      - "Use bif:contains for text search"
      - "Include FROM <http://rdfportal.org/dataset/ncbigene>"
      - "Add LIMIT (10-1000)"
      - "Start with specific gene IDs"
    example_fix: |
      # Wrong:
      FILTER(CONTAINS(?desc, "insulin"))
      # Right:
      ?desc bif:contains "'insulin'" option (score ?sc)

  - error: "Empty results for valid IDs"
    causes:
      - "Wrong identifier format"
      - "Missing full URI"
      - "No FROM clause"
      - "Discontinued gene ID"
    solutions:
      - "Use full URI: <http://identifiers.org/ncbigene/ID>"
      - "Verify ID exists in NCBI Gene"
      - "Include FROM clause"
      - "Check dct:modified for currency"
    example_fix: |
      # Wrong:
      ?gene dct:identifier 1
      # Right:
      <http://identifiers.org/ncbigene/1> ?p ?o

  - error: "Missing external links"
    causes:
      - "Using only dblink, not db_xref"
      - "Not using OPTIONAL blocks"
      - "Poor coverage for organism"
    solutions:
      - "Query both insdc:dblink and insdc:db_xref"
      - "Wrap in OPTIONAL blocks"
      - "Filter by well-curated organisms"
    example_fix: |
      # Incomplete:
      ?gene insdc:dblink ?link
      # Complete:
      OPTIONAL { ?gene insdc:dblink ?link }
      OPTIONAL { ?gene insdc:db_xref ?xref }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arkinjo/togo-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.