test_extract_gene_aa_change.py•5.84 kB
"""Tests for _extract_gene_aa_change method in external.py."""
import pytest
from biomcp.variants.external import ExternalVariantAggregator
class TestExtractGeneAAChange:
"""Test the _extract_gene_aa_change method."""
@pytest.fixture
def aggregator(self):
"""Create an ExternalVariantAggregator instance."""
return ExternalVariantAggregator()
def test_extract_from_docm(self, aggregator):
"""Test extraction from DOCM data."""
variant_data = {"docm": {"gene": "BRAF", "aa_change": "p.V600E"}}
result = aggregator._extract_gene_aa_change(variant_data)
assert result == "BRAF V600E"
def test_extract_from_hgvsp_long_format(self, aggregator):
"""Test extraction from hgvsp with long amino acid names."""
variant_data = {
"cadd": {"gene": {"genename": "TP53"}},
"hgvsp": ["p.Arg175His"],
}
result = aggregator._extract_gene_aa_change(variant_data)
# The code doesn't convert all long forms, just checks for Val/Ala
assert result == "TP53 Arg175His"
def test_extract_from_hgvsp_with_dbnsfp(self, aggregator):
"""Test extraction from hgvsp with dbnsfp gene name."""
variant_data = {
"dbnsfp": {"genename": "EGFR"},
"hgvsp": ["p.Leu858Arg"],
}
result = aggregator._extract_gene_aa_change(variant_data)
# The code doesn't convert Leu/Arg to L/R
assert result == "EGFR Leu858Arg"
def test_extract_from_cadd_data(self, aggregator):
"""Test extraction from CADD annotations."""
variant_data = {
"cadd": {
"gene": {"genename": "KRAS", "prot": {"protpos": 12}},
"oaa": "G",
"naa": "D",
}
}
result = aggregator._extract_gene_aa_change(variant_data)
assert result == "KRAS G12D"
def test_extract_from_docm_without_p_prefix(self, aggregator):
"""Test extraction from DOCM without p. prefix."""
variant_data = {"docm": {"gene": "PIK3CA", "aa_change": "E545K"}}
result = aggregator._extract_gene_aa_change(variant_data)
assert result == "PIK3CA E545K"
def test_extract_with_multiple_hgvsp(self, aggregator):
"""Test handling of multiple hgvsp entries - should take first."""
variant_data = {
"cadd": {"gene": {"genename": "BRCA1"}},
"hgvsp": ["p.Gln1756Ter", "p.Gln1756*"],
}
result = aggregator._extract_gene_aa_change(variant_data)
# Takes the first one, doesn't convert Gln/Ter
assert result == "BRCA1 Gln1756Ter"
def test_extract_with_special_characters(self, aggregator):
"""Test extraction with special characters in protein change."""
variant_data = {
"cadd": {"gene": {"genename": "MLH1"}},
"hgvsp": ["p.Lys618Alafs*9"],
}
result = aggregator._extract_gene_aa_change(variant_data)
# Should extract the basic AA change pattern
assert result is not None
assert "MLH1" in result
def test_extract_no_gene_name(self, aggregator):
"""Test when gene name is missing."""
variant_data = {"hgvsp": ["p.Val600Glu"]}
result = aggregator._extract_gene_aa_change(variant_data)
assert result is None
def test_extract_no_aa_change(self, aggregator):
"""Test when AA change is missing."""
variant_data = {"cadd": {"gene": {"genename": "BRAF"}}}
result = aggregator._extract_gene_aa_change(variant_data)
assert result is None
def test_extract_empty_variant_data(self, aggregator):
"""Test with empty variant data."""
result = aggregator._extract_gene_aa_change({})
assert result is None
def test_extract_malformed_hgvsp(self, aggregator):
"""Test with malformed HGVS protein notation."""
variant_data = {
"clinvar": {
"gene": {"symbol": "MYC"},
"hgvs": {"protein": ["invalid_format"]},
}
}
result = aggregator._extract_gene_aa_change(variant_data)
assert result is None
def test_extract_priority_order(self, aggregator):
"""Test that DOCM is prioritized for AA change, CADD for gene name."""
variant_data = {
"docm": {"gene": "BRAF", "aa_change": "p.V600E"},
"hgvsp": ["p.Val600Lys"], # Different change
"cadd": {
"gene": {"genename": "WRONG", "prot": {"protpos": 600}},
"oaa": "V",
"naa": "K",
},
}
result = aggregator._extract_gene_aa_change(variant_data)
# CADD is prioritized for gene name, DOCM for AA change
assert result == "WRONG V600E"
def test_extract_regex_with_val_ala(self, aggregator):
"""Test regex extraction when Val/Ala are present."""
# The code specifically looks for Val or Ala to trigger regex
variant_data = {
"cadd": {"gene": {"genename": "TEST1"}},
"hgvsp": ["p.Val600Ala"],
}
result = aggregator._extract_gene_aa_change(variant_data)
# The regex doesn't find a match in "Val600Ala" because it's looking for [A-Z]\d+[A-Z]
# which would match "V600A" but not "Val600Ala"
assert result == "TEST1 Val600Ala"
def test_extract_handles_exceptions_gracefully(self, aggregator):
"""Test that exceptions are handled gracefully."""
# This should trigger an exception internally but return None
variant_data = {
"cadd": {"gene": {"genename": "GENE"}},
"hgvsp": None, # This will cause issues
}
result = aggregator._extract_gene_aa_change(variant_data)
assert result is None