# analyze_toxicity_structure.py
"""
分析PubChem Toxicity数据的详细结构
"""
import json
from pathlib import Path
def _preview_field(value, limit=100):
    """Return a printable preview of *value*, at most *limit* characters.

    ``None`` (for example a JSON ``null``) is rendered as ``'N/A'``; any other
    non-string value is converted with ``str`` first, so truncating it can
    never raise ``TypeError``. Strings are simply sliced.
    """
    if value is None:
        return "N/A"
    return str(value)[:limit]


def _print_value_preview(value):
    """Print the type of a ``Value`` entry and, if present, its first markup string."""
    print(f" Value类型: {type(value)}")
    if not isinstance(value, dict):
        return
    print(f" Value keys: {list(value.keys())}")
    markup = value.get("StringWithMarkup")
    # Only a non-empty list has a "first" element worth previewing.
    if not isinstance(markup, list) or not markup:
        return
    print(" 第一个StringWithMarkup内容:")
    first_markup = markup[0]
    if isinstance(first_markup, dict):
        print(f" String: {_preview_field(first_markup.get('String', 'N/A'))}")


def _print_information_preview(number, info):
    """Print name, description, value and reference number of one Information entry."""
    print(f"\n [{number}] Name: {info.get('Name', 'N/A')}")
    print(f" Description: {_preview_field(info.get('Description', 'N/A'))}")
    print(f" StringValue: {_preview_field(info.get('StringValue', 'N/A'))}")
    if "Value" in info:
        _print_value_preview(info["Value"])
    if "ReferenceNumber" in info:
        print(f" ReferenceNumber: {info['ReferenceNumber']}")


def find_toxicity_section(sections, depth=0):
    """Print a structural summary of every Toxicity-related section.

    A section matches when its ``TOCHeading`` contains the substring
    ``"Toxicity"``. For each match the function prints a banner, a preview of
    at most the first five ``Information`` entries, a one-line summary of each
    direct sub-section, and then recurses into those sub-sections.

    NOTE(review): the indentation of the original source was lost; this
    implementation assumes that sub-sections are only descended into from a
    section that itself matched. Confirm against the intended behavior.

    Args:
        sections: Iterable of section dicts, e.g. the ``Record -> Section``
            list of the PubChem JSON file loaded by the script.
        depth: Recursion depth. It is incremented on recursive calls but not
            otherwise used.

    Returns:
        None. All output is written to stdout.
    """
    for section in sections:
        # "or ''" keeps a null heading from breaking the substring test.
        heading = section.get("TOCHeading") or ""
        if "Toxicity" not in heading:
            continue

        print("\n" + "=" * 80)
        print(f"发现Toxicity Section: {heading}")
        print("=" * 80)

        # Preview the entries attached directly to this section.
        if "Information" in section:
            information = section["Information"] or []
            print(f"\n直接包含 {len(information)} 条Information:")
            # Only the first five entries are shown to keep the output short.
            for number, info in enumerate(information[:5], start=1):
                _print_information_preview(number, info)

        # Summarize the direct sub-sections, then inspect them recursively.
        if "Section" in section:
            subsections = section["Section"] or []
            print(f"\n包含 {len(subsections)} 个子Section:")
            for subsection in subsections:
                sub_heading = subsection.get("TOCHeading", "")
                info_count = len(subsection.get("Information") or [])
                subsub_count = len(subsection.get("Section") or [])
                print(f" - {sub_heading} (Info: {info_count}, SubSections: {subsub_count})")
            find_toxicity_section(subsections, depth + 1)
def _clip_text(value, limit):
    """Return *value* as text truncated to *limit* characters ('N/A' for None)."""
    if value is None:
        return "N/A"
    return str(value)[:limit]


def _find_sections_by_heading(section, needle):
    """Return sections nested under *section* whose heading contains *needle*.

    The search covers every nesting level in document order. It does not
    descend into a section that already matched, so a match at the direct
    child level is reported exactly once.
    """
    matches = []
    # Explicit stack (reversed so that pop() yields document order).
    pending = list(reversed(section.get("Section") or []))
    while pending:
        candidate = pending.pop()
        if needle in (candidate.get("TOCHeading") or ""):
            matches.append(candidate)
            continue
        pending.extend(reversed(candidate.get("Section") or []))
    return matches


def main(json_file="pubchem_raw_702_Ethanol.json"):
    """Analyze the Toxicity data of one PubChem JSON record and save samples.

    Steps:
      1. Load *json_file* and print the Toxicity structure via
         ``find_toxicity_section``.
      2. Write the top-level section whose heading is exactly ``"Toxicity"``
         to ``toxicity_section_sample.json``.
      3. Write each nested section whose heading contains
         ``"Non-Human Toxicity Values"`` to
         ``non_human_toxicity_values_sample.json`` (a later match overwrites
         an earlier one) and print its first three records.

    Args:
        json_file: Path of the JSON file to analyze. Defaults to the Ethanol
            record used by the original script.

    Raises:
        OSError: If the input file cannot be read or an output file cannot be
            written.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    print(f"分析文件: {json_file}")
    with open(json_file, "r", encoding="utf-8") as source:
        data = json.load(source)

    # Tolerate a missing or null "Record"/"Section" instead of crashing.
    sections = (data.get("Record") or {}).get("Section") or []
    find_toxicity_section(sections)

    print("\n" + "=" * 80)
    print("现在提取一个具体的Toxicity Information示例:")
    print("=" * 80)

    for section in sections:
        if section.get("TOCHeading") != "Toxicity":
            continue

        # Keep the whole Toxicity section as a reference sample.
        with open("toxicity_section_sample.json", "w", encoding="utf-8") as out:
            json.dump(section, out, ensure_ascii=False, indent=2)
        print("完整的Toxicity Section已保存到: toxicity_section_sample.json")

        # The target heading may be nested below an intermediate section
        # (presumably "Toxicological Information" in PubChem records —
        # TODO confirm), so look at every level, not only direct children.
        for subsection in _find_sections_by_heading(section, "Non-Human Toxicity Values"):
            sub_heading = subsection.get("TOCHeading", "")
            print(f"\n找到 {sub_heading}:")
            with open("non_human_toxicity_values_sample.json", "w", encoding="utf-8") as out:
                json.dump(subsection, out, ensure_ascii=False, indent=2)
            print(" 已保存到: non_human_toxicity_values_sample.json")

            if "Information" not in subsection:
                continue
            records = subsection["Information"] or []
            print(f"\n 包含 {len(records)} 条记录,前3条:")
            for number, info in enumerate(records[:3], start=1):
                print(f"\n 记录 {number}:")
                print(f" Name: {info.get('Name', 'N/A')}")
                print(f" Description: {_clip_text(info.get('Description', 'N/A'), 150)}")

                value = info.get("Value")
                if not isinstance(value, dict):
                    continue
                markup_list = value.get("StringWithMarkup")
                if not isinstance(markup_list, list):
                    continue
                # Only the first markup entry is previewed.
                for markup in markup_list[:1]:
                    if isinstance(markup, dict):
                        print(f" Value String: {_clip_text(markup.get('String', 'N/A'), 200)}")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()