# Dataset identity, SPARQL endpoint and access policy for the PDB RDF data.
schema_info:
  title: Protein Data Bank (PDB)
  description: |
    3D structural data for proteins, nucleic acids, and complexes from X-ray, NMR, and cryo-EM with 204K+ entries. Contains experimental methods, resolution, R-factors, sequences, and cross-references to UniProt, EMDB, GenBank. Supports structure quality assessment, experimental method analysis, and biological function studies.
  endpoint: https://rdfportal.org/backend/pdb/sparql
  # Entry IRIs are this base followed by the PDB ID; the example queries
  # strip it with STRAFTER to obtain ?entry_id.
  base_uri: http://rdf.wwpdb.org/pdb/
  # Named graph used in the FROM clause of the example queries.
  graphs:
  - http://rdfportal.org/dataset/pdbj
  version:
    mie_version: "1.3"
    mie_created: "2025-01-10"
    data_version: "Current"
    update_frequency: "Weekly"
  license:
    data_license: "Public Domain"
    license_url: https://www.wwpdb.org
  access:
    rate_limiting: "Reasonable use"
    max_query_timeout: "60s"
    backend: "Virtuoso"
# Shape expressions for the datablock root and the item-level PDBx classes.
# NOTE(review): <Datablock> points at @<...Category> shapes (and
# @<AssemblyCategory>), while the shapes defined in this block are the
# item-level ones (<Entry>, <Entity>, ..., <StructAssembly>) - confirm whether
# the category shapes are defined elsewhere.
shape_expressions: |
  PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
  PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
  PREFIX dct: <http://purl.org/dc/terms/>
  <Datablock> {
    a [ pdbx:datablock ] ;
    pdbx:has_entryCategory @<EntryCategory> ;
    pdbx:has_entityCategory @<EntityCategory> * ;
    pdbx:has_entity_polyCategory @<EntityPolyCategory> * ;
    pdbx:has_exptlCategory @<ExptlCategory> + ;
    pdbx:has_refineCategory @<RefineCategory> * ;
    pdbx:has_struct_refCategory @<StructRefCategory> * ;
    pdbx:has_database_2Category @<Database2Category> * ;
    pdbx:has_citationCategory @<CitationCategory> * ;
    pdbx:has_structCategory @<StructCategory> ;
    pdbx:has_struct_keywordsCategory @<StructKeywordsCategory> * ;
    pdbx:has_cellCategory @<CellCategory> * ;
    pdbx:has_symmetryCategory @<SymmetryCategory> * ;
    pdbx:has_softwareCategory @<SoftwareCategory> * ;
    pdbx:has_pdbx_struct_assemblyCategory @<AssemblyCategory> * ;
    pdbx:has_struct_asymCategory @<StructAsymCategory> * ;
    pdbx:has_audit_authorCategory @<AuditAuthorCategory> *
  }
  <Entry> {
    a [ pdbx:entry ] ;
    pdbx:entry.id xsd:string ;
    pdbx:of_datablock IRI
  }
  <Entity> {
    a [ pdbx:entity ] ;
    pdbx:entity.id xsd:string ;
    pdbx:entity.type [ "polymer" "non-polymer" "water" "macrolide" "branched" ] ;
    pdbx:entity.pdbx_description xsd:string ? ;
    pdbx:entity.formula_weight xsd:decimal ? ;
    pdbx:entity.pdbx_number_of_molecules xsd:integer ? ;
    pdbx:entity.src_method xsd:string ? ;
    pdbx:referenced_by_entity_poly IRI * ;
    pdbx:referenced_by_struct_ref IRI * ;
    pdbx:referenced_by_struct_asym IRI * ;
    pdbx:of_datablock IRI
  }
  <EntityPoly> {
    a [ pdbx:entity_poly ] ;
    pdbx:entity_poly.entity_id xsd:string ;
    pdbx:entity_poly.type xsd:string ; # polypeptide(L), polyribonucleotide, polydeoxyribonucleotide
    pdbx:entity_poly.pdbx_seq_one_letter_code xsd:string ? ;
    pdbx:entity_poly.pdbx_seq_one_letter_code_can xsd:string ? ;
    pdbx:entity_poly.pdbx_strand_id xsd:string ? ;
    pdbx:reference_to_entity IRI ;
    pdbx:of_datablock IRI
  }
  <Refine> {
    a [ pdbx:refine ] ;
    pdbx:refine.entry_id xsd:string ;
    pdbx:refine.pdbx_refine_id xsd:string ;
    pdbx:refine.ls_d_res_high xsd:decimal ? ;
    pdbx:refine.ls_d_res_low xsd:decimal ? ;
    pdbx:refine.ls_R_factor_R_work xsd:decimal ? ;
    pdbx:refine.ls_R_factor_R_free xsd:decimal ? ;
    pdbx:refine.ls_number_reflns_obs xsd:integer ? ;
    pdbx:refine.ls_percent_reflns_obs xsd:decimal ? ;
    pdbx:reference_to_entry IRI ;
    pdbx:of_datablock IRI
  }
  <Exptl> {
    a [ pdbx:exptl ] ;
    pdbx:exptl.entry_id xsd:string ;
    pdbx:exptl.method xsd:string ; # X-RAY DIFFRACTION, ELECTRON MICROSCOPY, SOLUTION NMR
    pdbx:exptl.crystals_number xsd:integer ? ;
    pdbx:reference_to_entry IRI ;
    pdbx:of_datablock IRI
  }
  <StructRef> {
    a [ pdbx:struct_ref ] ;
    pdbx:struct_ref.id xsd:string ;
    pdbx:struct_ref.db_name xsd:string ; # UNP, GB, EMBL, REF
    pdbx:struct_ref.db_code xsd:string ? ;
    pdbx:struct_ref.pdbx_db_accession xsd:string ? ;
    pdbx:struct_ref.entity_id xsd:string ? ;
    pdbx:reference_to_entity IRI * ;
    pdbx:of_datablock IRI
  }
  <Database2> {
    a [ pdbx:database_2 ] ;
    pdbx:database_2.database_id xsd:string ; # PDB, WWPDB, EMDB, BMRB
    pdbx:database_2.database_code xsd:string ;
    pdbx:of_datablock IRI
  }
  <Citation> {
    a [ pdbx:citation ] ;
    pdbx:citation.id xsd:string ;
    pdbx:citation.title xsd:string ? ;
    pdbx:citation.journal_abbrev xsd:string ? ;
    pdbx:citation.journal_volume xsd:string ? ;
    pdbx:citation.page_first xsd:string ? ;
    pdbx:citation.page_last xsd:string ? ;
    pdbx:citation.year xsd:integer ? ;
    pdbx:citation.pdbx_database_id_PubMed xsd:string ? ;
    pdbx:citation.pdbx_database_id_DOI xsd:string ? ;
    pdbx:link_to_doi IRI ? ;
    pdbx:link_to_pubmed IRI ? ;
    dct:references IRI * ;
    pdbx:referenced_by_citation_author IRI * ;
    pdbx:of_datablock IRI
  }
  <Struct> {
    a [ pdbx:struct ] ;
    pdbx:struct.entry_id xsd:string ;
    pdbx:struct.title xsd:string ? ;
    pdbx:struct.pdbx_descriptor xsd:string ? ;
    pdbx:struct.pdbx_model_details xsd:string ? ;
    pdbx:reference_to_entry IRI ;
    pdbx:of_datablock IRI
  }
  <StructKeywords> {
    a [ pdbx:struct_keywords ] ;
    pdbx:struct_keywords.entry_id xsd:string ;
    pdbx:struct_keywords.pdbx_keywords xsd:string ? ;
    pdbx:struct_keywords.text xsd:string ? ;
    pdbx:reference_to_entry IRI ;
    pdbx:of_datablock IRI
  }
  <Cell> {
    a [ pdbx:cell ] ;
    pdbx:cell.entry_id xsd:string ;
    pdbx:cell.length_a xsd:decimal ? ;
    pdbx:cell.length_b xsd:decimal ? ;
    pdbx:cell.length_c xsd:decimal ? ;
    pdbx:cell.angle_alpha xsd:decimal ? ;
    pdbx:cell.angle_beta xsd:decimal ? ;
    pdbx:cell.angle_gamma xsd:decimal ? ;
    pdbx:cell.Z_PDB xsd:integer ? ;
    pdbx:reference_to_entry IRI ;
    pdbx:of_datablock IRI
  }
  <Symmetry> {
    a [ pdbx:symmetry ] ;
    pdbx:symmetry.entry_id xsd:string ;
    pdbx:symmetry.space_group_name_H-M xsd:string ? ;
    pdbx:symmetry.Int_Tables_number xsd:integer ? ;
    pdbx:reference_to_entry IRI ;
    pdbx:of_datablock IRI
  }
  <Software> {
    a [ pdbx:software ] ;
    pdbx:software.name xsd:string ? ;
    pdbx:software.version xsd:string ? ;
    pdbx:software.classification xsd:string ? ; # data collection, refinement, data reduction, data scaling
    pdbx:software.pdbx_ordinal xsd:integer ? ;
    pdbx:of_datablock IRI
  }
  <StructAssembly> {
    a [ pdbx:pdbx_struct_assembly ] ;
    pdbx:pdbx_struct_assembly.id xsd:string ;
    pdbx:pdbx_struct_assembly.details xsd:string ? ;
    pdbx:pdbx_struct_assembly.oligomeric_count xsd:integer ? ;
    pdbx:pdbx_struct_assembly.oligomeric_details xsd:string ? ;
    pdbx:referenced_by_pdbx_struct_assembly_gen IRI * ;
    pdbx:of_datablock IRI
  }
  <StructAsym> {
    a [ pdbx:struct_asym ] ;
    pdbx:struct_asym.id xsd:string ;
    pdbx:struct_asym.entity_id xsd:string ? ;
    pdbx:struct_asym.pdbx_blank_PDB_chainid_flag [ "Y" "N" ] ? ;
    pdbx:reference_to_entity IRI ? ;
    pdbx:of_datablock IRI
  }
sample_rdf_entries:
# Sample: a datablock and one of its refine records with resolution and
# R-work. The snippet previously used an undeclared `pdb:` prefix with `/` and
# `,` in local names, which is not legal Turtle; resource IRIs are now written
# relative to @base (the dataset base URI) so the sample parses as-is.
- title: "High-Resolution Protein (3NIR)"
  description: "Ultra-high resolution structure with refinement statistics."
  rdf: |
    @base <http://rdf.wwpdb.org/pdb/> .
    @prefix pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
    <3NIR> a pdbx:datablock ;
      pdbx:has_refineCategory <3NIR/refineCategory> .
    <3NIR/refine/3NIR,X-RAY_DIFFRACTION> a pdbx:refine ;
      pdbx:refine.ls_d_res_high "0.48"^^xsd:decimal ;
      pdbx:refine.ls_R_factor_R_work "0.127"^^xsd:decimal .
# Sample: a polymer entity record with its description. The subject is
# written as an IRI relative to @base because the former `pdb:100D/entity/1`
# form used an undeclared prefix and `/` in a local name (invalid Turtle).
- title: "DNA Structure (100D)"
  description: "Dickerson dodecamer with sequence."
  rdf: |
    @base <http://rdf.wwpdb.org/pdb/> .
    @prefix pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#> .
    <100D/entity/1> a pdbx:entity ;
      pdbx:entity.id "1" ;
      pdbx:entity.type "polymer" ;
      pdbx:entity.pdbx_description "DNA/RNA (5'-R(*CP*)-D(*CP*GP*GP*CP*GP*CP*CP*GP*)-R(*G)-3')" .
# Sample: a struct_ref record pointing at a UniProt accession. The subject is
# an IRI relative to @base; the former `pdb:` prefixed form was undeclared and
# contained `/`, so the snippet did not parse as Turtle.
- title: "Protein with UniProt (16PK)"
  description: "Protein with UniProt cross-reference."
  rdf: |
    @base <http://rdf.wwpdb.org/pdb/> .
    @prefix pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#> .
    <16PK/struct_ref/1> a pdbx:struct_ref ;
      pdbx:struct_ref.db_name "UNP" ;
      pdbx:struct_ref.pdbx_db_accession "P07378" .
# Sample: an EMDB database_2 link plus the exptl record giving the method.
# Subjects are IRIs relative to @base; the former `pdb:` prefixed names were
# undeclared and contained `/`, which is invalid Turtle.
- title: "Cryo-EM Structure (8A2Z)"
  description: "EM structure with EMDB reference."
  rdf: |
    @base <http://rdf.wwpdb.org/pdb/> .
    @prefix pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#> .
    <8A2Z/database_2/1> a pdbx:database_2 ;
      pdbx:database_2.database_id "EMDB" ;
      pdbx:database_2.database_code "EMD-15109" .
    <8A2Z/exptl/1> a pdbx:exptl ;
      pdbx:exptl.method "ELECTRON MICROSCOPY" .
# Sample: an entity_poly record with polymer type and one-letter sequence.
# The subject is an IRI relative to @base; the former `pdb:300D/entity_poly/1`
# used an undeclared prefix and `/` in a local name (invalid Turtle).
- title: "RNA Structure (300D)"
  description: "Hammerhead ribozyme with sequence."
  rdf: |
    @base <http://rdf.wwpdb.org/pdb/> .
    @prefix pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#> .
    <300D/entity_poly/1> a pdbx:entity_poly ;
      pdbx:entity_poly.type "polyribonucleotide" ;
      pdbx:entity_poly.pdbx_seq_one_letter_code "GUGGUCUGAUGAGGCC" .
sparql_query_examples:
# Case-insensitive substring match on struct_keywords.pdbx_keywords, returning
# the entry ID, structure title and keyword string.
- title: "Search by Keywords"
  description: "Find structures by classification keywords with biological annotations"
  question: "Find kinase structures"
  complexity: basic
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?title ?keywords
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_structCategory/pdbx:has_struct ?struct ;
             pdbx:has_struct_keywordsCategory/pdbx:has_struct_keywords ?kw .
      ?struct pdbx:struct.title ?title .
      ?kw pdbx:struct_keywords.pdbx_keywords ?keywords .
      FILTER(CONTAINS(LCASE(?keywords), "kinase"))
    }
    LIMIT 20
# Counts software records classified exactly as "refinement", grouped by
# software name, most-used first.
- title: "Software Usage Statistics"
  description: "Analyze software tools used in structure determination"
  question: "What software is most used for refinement?"
  complexity: basic
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?software_name ?classification (COUNT(?entry) as ?usage_count)
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry pdbx:has_softwareCategory/pdbx:has_software ?sw .
      ?sw pdbx:software.name ?software_name ;
          pdbx:software.classification ?classification .
      FILTER(?classification = "refinement")
    }
    GROUP BY ?software_name ?classification
    ORDER BY DESC(?usage_count)
    LIMIT 15
# Joins cell, symmetry and exptl records for X-RAY DIFFRACTION entries; the
# numeric filter is applied to cell length a only (20 < a < 50).
- title: "Crystallographic Parameters"
  description: "Find X-ray structures with unit cell and symmetry information"
  question: "What are the cell parameters for small unit cells?"
  complexity: intermediate
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    SELECT ?entry_id ?a ?b ?c ?space_group
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_cellCategory/pdbx:has_cell ?cell ;
             pdbx:has_symmetryCategory/pdbx:has_symmetry ?sym ;
             pdbx:has_exptlCategory/pdbx:has_exptl/pdbx:exptl.method "X-RAY DIFFRACTION" .
      ?cell pdbx:cell.length_a ?a ;
            pdbx:cell.length_b ?b ;
            pdbx:cell.length_c ?c .
      ?sym pdbx:symmetry.space_group_name_H-M ?space_group .
      FILTER(xsd:decimal(?a) > 20 && xsd:decimal(?a) < 50)
    }
    LIMIT 20
# Lists assemblies whose oligomeric_details text contains "tetrameric"
# (case-sensitive substring match).
- title: "Biological Assemblies"
  description: "Find quaternary structures with oligomeric state annotations"
  question: "Which structures form tetramers?"
  complexity: intermediate
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?assembly_id ?oligomeric_count ?oligomeric_details
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_pdbx_struct_assemblyCategory/pdbx:has_pdbx_struct_assembly ?assembly .
      ?assembly pdbx:pdbx_struct_assembly.id ?assembly_id ;
                pdbx:pdbx_struct_assembly.oligomeric_count ?oligomeric_count ;
                pdbx:pdbx_struct_assembly.oligomeric_details ?oligomeric_details .
      FILTER(CONTAINS(?oligomeric_details, "tetrameric"))
    }
    LIMIT 20
# Case-insensitive match on citation titles; title, year and journal are
# required, the DOI is optional.
- title: "Search Publications"
  description: "Find structures by publication title keywords with citation metadata"
  question: "Find CRISPR-related structures with publications"
  complexity: intermediate
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?title ?year ?journal ?doi
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_citationCategory/pdbx:has_citation ?citation .
      ?citation pdbx:citation.title ?title ;
                pdbx:citation.year ?year ;
                pdbx:citation.journal_abbrev ?journal .
      OPTIONAL { ?citation pdbx:citation.pdbx_database_id_DOI ?doi }
      FILTER(CONTAINS(LCASE(?title), "crispr"))
    }
    LIMIT 20
# Joins struct, keywords, refine and citation records per entry, keeping
# kinase-keyword entries with resolution below 2.0 and a citation year of
# 2020 or later, ordered by resolution.
- title: "Comprehensive Structure Analysis"
  description: "Multi-category query combining experimental, quality, and metadata"
  question: "What are high-quality recent kinase structures?"
  complexity: advanced
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    SELECT ?entry_id ?title ?resolution ?r_work ?year ?keywords
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_structCategory/pdbx:has_struct ?struct ;
             pdbx:has_struct_keywordsCategory/pdbx:has_struct_keywords ?kw ;
             pdbx:has_refineCategory/pdbx:has_refine ?refine ;
             pdbx:has_citationCategory/pdbx:has_citation ?citation .
      ?struct pdbx:struct.title ?title .
      ?kw pdbx:struct_keywords.pdbx_keywords ?keywords .
      ?refine pdbx:refine.ls_d_res_high ?resolution ;
              pdbx:refine.ls_R_factor_R_work ?r_work .
      ?citation pdbx:citation.year ?year .
      FILTER(CONTAINS(LCASE(?keywords), "kinase"))
      FILTER(xsd:decimal(?resolution) < 2.0)
      FILTER(xsd:integer(?year) >= 2020)
    }
    ORDER BY xsd:decimal(?resolution)
    LIMIT 20
# Requires each entry to have software records for all three classifications
# (data collection, data reduction, refinement) and concatenates the distinct
# names per classification.
- title: "Software Pipeline Analysis"
  description: "Trace complete data processing workflow through software categories"
  question: "What software pipelines are used together?"
  complexity: advanced
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id
      (GROUP_CONCAT(DISTINCT ?collection; separator=", ") as ?collection_sw)
      (GROUP_CONCAT(DISTINCT ?reduction; separator=", ") as ?reduction_sw)
      (GROUP_CONCAT(DISTINCT ?refinement; separator=", ") as ?refinement_sw)
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_softwareCategory/pdbx:has_software ?sw1, ?sw2, ?sw3 .
      ?sw1 pdbx:software.name ?collection ;
           pdbx:software.classification "data collection" .
      ?sw2 pdbx:software.name ?reduction ;
           pdbx:software.classification "data reduction" .
      ?sw3 pdbx:software.name ?refinement ;
           pdbx:software.classification "refinement" .
    }
    GROUP BY ?entry_id
    LIMIT 20
cross_references:
# Sequence-database cross-references; the query lists db_name and accession
# for every struct_ref record of each entry.
- pattern: pdbx:struct_ref
  description: |
    Sequence database cross-references with db_name and pdbx_db_accession.
  databases:
    Protein:
    - UNP (UniProt): 352K refs, ~172%
    - PIR: 131
    Nucleotide:
    - GB (GenBank): 5.9K
    - EMBL: 84
    - REF (RefSeq): 49
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?db_name ?db_accession
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_struct_refCategory/pdbx:has_struct_ref ?ref .
      ?ref pdbx:struct_ref.db_name ?db_name ;
           pdbx:struct_ref.pdbx_db_accession ?db_accession .
    }
    LIMIT 30
# External database identifiers; the query lists database_id and
# database_code for every database_2 record of each entry.
- pattern: pdbx:database_2
  description: |
    External database links for WWPDB partners, EMDB, BMRB.
  databases:
    WWPDB:
    - PDB: 100%
    - WWPDB: 100%
    - RCSB: ~41%
    - PDBE: ~8%
    Structure:
    - EMDB: ~7%
    - BMRB: ~3%
    - NDB: ~2%
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?db_id ?db_code
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_database_2Category/pdbx:has_database_2 ?db .
      ?db pdbx:database_2.database_id ?db_id ;
          pdbx:database_2.database_code ?db_code .
    }
    LIMIT 30
# Publication identifiers; DOI and PubMed ID are each optional, but the
# filter keeps only citations that have at least one of them.
- pattern: pdbx:citation
  description: |
    Publication references with DOI and PubMed identifiers.
  databases:
    Publications:
    - DOI: ~75%
    - PubMed: ~73%
  sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?doi ?pubmed
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_citationCategory/pdbx:has_citation ?citation .
      OPTIONAL { ?citation pdbx:citation.pdbx_database_id_DOI ?doi }
      OPTIONAL { ?citation pdbx:citation.pdbx_database_id_PubMed ?pubmed }
      FILTER(BOUND(?doi) || BOUND(?pubmed))
    }
    LIMIT 20
# Free-text notes on how the RDF is organised and how to query it.
architectural_notes:
  # Datablock -> has_*Category -> has_* -> item record layout.
  schema_design:
  - PDBx/mmCIF ontology with Category-Item design pattern
  - Datablock root with has_*Category properties linking to specialized categories
  - Each category contains has_* properties linking to individual items
  - Categories include entity, entity_poly, exptl, refine, struct_ref, database_2, citation, struct, cell, symmetry, software, pdbx_struct_assembly, struct_asym
  - Strong separation between structural data, experiments, refinement, cross-references, metadata
  - Bidirectional references (e.g., reference_to_entry, referenced_by_*)
  # Query-writing guidance.
  performance:
  - 204K+ entries with 900K+ entities require optimization
  - Use xsd:decimal() for resolution/R-factor numeric comparisons
  - Use CONTAINS/bif:contains for keyword/title searches
  - Filter by pdbx:datablock type to get entry-level data
  - Multi-join queries across categories need entry_id filtering early
  - Category traversal (has_*Category/has_*) efficient for targeted queries
  # External resources reachable from PDB records.
  data_integration:
  - UniProt for protein sequences (352K refs, ~172% per entry)
  - EMDB for cryo-EM density maps (~7%)
  - GenBank/EMBL/RefSeq for nucleotide sequences
  - DOI/PubMed for publications (~75%/73%)
  - link_to_doi and link_to_pubmed for direct external links
  - dct:references for identifier-based references
  # Which fields are present for which kinds of structure.
  data_quality:
  - Resolution and R-factors indicate X-ray structure quality
  - Multiple UniProt refs per entry common (one per chain)
  - NMR structures lack resolution data
  - EM structures may lack R-factors
  - Cell and symmetry data only for crystallographic methods
  - Software metadata varies by deposition date
# Counts and approximate coverage figures for the dataset.
data_statistics:
  total_entries: 204594
  experimental_methods:
    xray: "~85%"
    em: "~7%"
    nmr: "~7%"
  # Values above 100% reflect more than one reference per entry on average
  # (compare cardinality.avg_uniprot_refs below).
  coverage:
    uniprot: "~172%"
    emdb: "~7%"
    doi: "~75%"
    pubmed: "~73%"
    resolution: ">85%"
    cell_data: "~85%"
    symmetry: "~85%"
    software: "~90%"
    keywords: "~95%"
    assemblies: "~90%"
  cardinality:
    avg_uniprot_refs: 1.72
    avg_entities: 4.4
    avg_assemblies: 1.2
    avg_software_entries: 3.5
  cross_reference_counts:
    uniprot: 352114
    genbank: 5874
    emdb: 13974
    doi: 186683
    pubmed: 181261
  software_usage:
    top_refinement: PHENIX (72K), REFMAC (70K)
    top_phasing: PHASER (62K)
    top_reduction: XDS (51K), HKL-2000 (40K)
  # NOTE(review): placed under data_statistics on the assumption that it is
  # this section's own list, distinct from architectural_notes.performance -
  # confirm the intended parent.
  performance:
  - "Resolution queries: efficient with numeric filters"
  - "Cross-reference by db_name: works well"
  - "Keyword searches: use CONTAINS on keywords field"
  - "Category traversal: 2-3 property path steps optimal"
  - "Recommend LIMIT 20-100"
anti_patterns:
# Anti-pattern: comparing resolution without an explicit numeric cast.
# Both queries carry the FROM clause and LIMIT that this file recommends for
# every query, so the only difference between them is the xsd:decimal() cast
# (plus the > 0 guard that drops invalid values).
- title: "Missing Numeric Conversion"
  problem: "Resolution comparisons fail without casting"
  wrong_sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?resolution
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?refine pdbx:refine.ls_d_res_high ?resolution .
      FILTER(?resolution < 1.0)
    }
    LIMIT 20
  correct_sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    SELECT ?resolution
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?refine pdbx:refine.ls_d_res_high ?resolution .
      FILTER(xsd:decimal(?resolution) > 0 && xsd:decimal(?resolution) < 1.0)
    }
    LIMIT 20
  explanation: "Always use xsd:decimal() for numeric comparisons"
# Anti-pattern: substring search on struct.title instead of the keywords
# field. The wrong query previously projected ?entry_id without ever binding
# it and omitted FROM/LIMIT; it now mirrors the correct query so that the
# searched field is the only difference between the two.
- title: "Inefficient Keyword Search"
  problem: "Not using indexed keyword fields"
  wrong_sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?title
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_structCategory/pdbx:has_struct ?struct .
      ?struct pdbx:struct.title ?title .
      FILTER(CONTAINS(LCASE(?title), "kinase"))
    }
    LIMIT 20
  correct_sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id ?keywords
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_struct_keywordsCategory/pdbx:has_struct_keywords ?kw .
      ?kw pdbx:struct_keywords.pdbx_keywords ?keywords .
      FILTER(CONTAINS(LCASE(?keywords), "kinase"))
    }
    LIMIT 20
  explanation: "Use struct_keywords for faster searches"
# Anti-pattern: matching on a category property without restricting the
# subject to pdbx:datablock. The wrong query now declares the pdbx: prefix and
# graph so that it is wrong only for the stated reason, and the correct query
# carries the LIMIT this file recommends for every query.
- title: "Missing Entry Filter"
  problem: "Not filtering by datablock returns non-entries"
  wrong_sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry pdbx:has_refineCategory ?refine .
    }
    LIMIT 20
  correct_sparql: |
    PREFIX pdbx: <http://rdf.wwpdb.org/schema/pdbx-v50.owl#>
    SELECT ?entry_id
    FROM <http://rdfportal.org/dataset/pdbj>
    WHERE {
      ?entry a pdbx:datablock .
      BIND(STRAFTER(str(?entry), "http://rdf.wwpdb.org/pdb/") AS ?entry_id)
      ?entry pdbx:has_refineCategory ?refine .
    }
    LIMIT 20
  explanation: "Filter by pdbx:datablock and extract entry_id"
common_errors:
# Numeric comparisons on resolution / R-factor values need an explicit cast.
- error: "Type conversion errors"
  causes:
  - "Comparing resolution strings without xsd:decimal()"
  - "Missing numeric filters"
  solutions:
  - "Always use xsd:decimal() for resolution/R-factors"
  - "Filter out invalid values (< 0 or > 100)"
  example_fix: |
    # Wrong: FILTER(?resolution < 1.0)
    # Right: FILTER(xsd:decimal(?resolution) > 0 && xsd:decimal(?resolution) < 1.0)
# Fields that are absent for some entries should be matched with OPTIONAL.
- error: "Missing optional data"
  causes:
  - "NMR structures lack resolution"
  - "Not using OPTIONAL for cross-references"
  - "Software metadata not always complete"
  solutions:
  - "Use OPTIONAL for resolution, R-factors"
  - "Filter by experimental method if requiring resolution"
  - "Use OPTIONAL for software, citations, assemblies"
# Causes of slow queries and the corresponding mitigations.
- error: "Query timeout"
  causes:
  - "Using CONTAINS on long text fields"
  - "Not limiting results"
  - "Missing FROM clause"
  - "Complex multi-category joins without filters"
  solutions:
  - "Add LIMIT 20-100"
  - "Include FROM clause"
  - "Use keyword fields instead of titles"
  - "Filter by entry_id early in multi-joins"