// utils.ts (20.6 kB)
import {
DrugLabel,
GoogleScholarArticle,
PubMedArticle,
RxNormDrug,
WHOIndicator,
} from "./types.js";
import superagent from "superagent";
import puppeteer from "puppeteer";
import {
FDA_API_BASE,
GOOGLE_SCHOLAR_API_BASE,
PUBMED_API_BASE,
RXNAV_API_BASE,
USER_AGENT,
WHO_API_BASE,
PBS_API_BASE,
} from "./constants.js";
/**
 * Search openFDA drug labels by brand name.
 *
 * @param query - Brand name to look for; placed after `openfda.brand_name:` in the search expression as-is.
 * @param limit - Maximum number of label records to request (default 10).
 * @returns The `results` array of the response, or an empty array when there are no matches.
 * @throws Re-throws any request failure other than an HTTP 404.
 */
export async function searchDrugs(
  query: string,
  limit: number = 10,
): Promise<DrugLabel[]> {
  try {
    const res = await superagent
      .get(`${FDA_API_BASE}/drug/label.json`)
      .query({
        search: `openfda.brand_name:${query}`,
        limit,
      })
      .set("User-Agent", USER_AGENT);
    return res.body?.results || [];
  } catch (err: unknown) {
    // openFDA reports "no matches" as HTTP 404, which superagent surfaces as a
    // thrown error. Treat that one status as an empty result set; everything
    // else (network failure, 5xx, rate limiting) is still propagated.
    const status = (err as { status?: number } | null | undefined)?.status;
    if (status === 404) return [];
    throw err;
  }
}
/**
 * Look up a single openFDA drug label by NDC.
 *
 * The input is normalised into several candidate spellings (hyphenated,
 * digits-only, labeler-product prefix) and each is tried against
 * `openfda.product_ndc` and `openfda.package_ndc`, first quoted and then
 * unquoted. A final OR-combined query is used as a last resort.
 *
 * @param ndc - NDC in any common form, with or without hyphens.
 * @returns The first matching label, or `null` when nothing matched. Request
 *          failures for individual attempts are treated as "no match".
 */
export async function getDrugByNDC(ndc: string): Promise<DrugLabel | null> {
  const raw = String(ndc || "").trim();
  const digits = raw.replace(/[^0-9]/g, "");
  const digits10 = digits.length === 10 ? digits : "";
  const digits11 = digits.length === 11 ? digits : "";

  // 11 digits are hyphenated as 5-4-2 (labeler-product-package). Every other
  // input is searched exactly as typed; 10-digit inputs are ambiguous and get
  // explicit candidates below instead.
  const hyphen = digits11
    ? `${digits11.slice(0, 5)}-${digits11.slice(5, 9)}-${digits11.slice(9)}`
    : raw;

  // From a 3-segment value derive the 2-segment product code (labeler-product)
  const productHyphen = hyphen.match(/^(\d{4,5}-\d{3,4})-\d{1,2}$/)?.[1] ?? "";

  // A bare 10-digit code can be split as 4-4-2, 5-3-2 or 5-4-1
  const hyphen10Candidates: string[] = digits10
    ? [
        `${digits10.slice(0, 4)}-${digits10.slice(4, 8)}-${digits10.slice(8)}`,
        `${digits10.slice(0, 5)}-${digits10.slice(5, 8)}-${digits10.slice(8)}`,
        `${digits10.slice(0, 5)}-${digits10.slice(5, 9)}-${digits10.slice(9)}`,
      ]
    : [];

  // Run one search and return its first hit; any failure counts as "no match"
  // so the caller can move on to the next candidate.
  const fetchFirstLabel = async (search: string): Promise<DrugLabel | null> => {
    try {
      const res = await superagent
        .get(`${FDA_API_BASE}/drug/label.json`)
        .query({ search, limit: 1 })
        .set("User-Agent", USER_AGENT);
      return res.body.results?.[0] || null;
    } catch {
      return null;
    }
  };

  // Ordered, de-duplicated list of (field, value) pairs to try
  const attempts: Array<{ field: string; value: string }> = [];
  const seen = new Set<string>();
  const pushAttempt = (field: string, value: string): void => {
    const v = value.trim();
    const key = `${field}|${v}`;
    if (!v || seen.has(key)) return;
    seen.add(key);
    attempts.push({ field, value: v });
  };

  // Prefer hyphenated first (most reliable)
  [hyphen, ...hyphen10Candidates, productHyphen, raw, digits10, digits11].forEach((v) =>
    pushAttempt("openfda.product_ndc", v),
  );
  [hyphen, ...hyphen10Candidates, raw, digits10, digits11].forEach((v) =>
    pushAttempt("openfda.package_ndc", v),
  );

  for (const { field, value } of attempts) {
    // Exact (quoted) match first, then the unquoted fallback
    const exact = await fetchFirstLabel(`${field}:"${value}"`);
    if (exact) return exact;
    const loose = await fetchFirstLabel(`${field}:${value}`);
    if (loose) return loose;
  }

  // Final OR-combined attempt across all candidate forms.
  // The terms are joined with " OR " (spaces), not "+OR+": superagent
  // percent-encodes the query object, so a literal "+" would reach the server
  // as "%2B" instead of the space the search syntax expects.
  const orValues = Array.from(
    new Set(
      [hyphen, ...hyphen10Candidates, productHyphen, raw, digits10, digits11].filter(Boolean),
    ),
  );
  const orParts = orValues.flatMap((v) => [
    `openfda.product_ndc:${v}`,
    `openfda.package_ndc:${v}`,
  ]);
  if (orParts.length) {
    return fetchFirstLabel(orParts.join(" OR "));
  }
  return null;
}
// WHO API functions
/**
 * Fetch WHO indicator observations for an indicator identified by name.
 *
 * The indicator code is resolved with an exact name match first and a
 * `contains()` search second. Observations are then read from the
 * indicator-specific endpoint, newest first (up to 200 rows), restricted to
 * "both sexes" rows or rows without a Dim1 value.
 *
 * @param indicatorName - Indicator name, or part of one, to resolve to a code.
 * @param country - Optional country name or ISO code. When omitted, the
 *                  DEFAULT_COUNTRY_ISO3 environment variable is used if set.
 *                  Values that cannot be resolved to three letters are ignored.
 * @returns The `value` array of the response, or `[]` when the indicator cannot
 *          be resolved or any request fails.
 */
export async function getHealthIndicators(
  indicatorName: string,
  country?: string,
): Promise<WHOIndicator[]> {
  // An empty name would make contains(IndicatorName,'') match every indicator
  // and return data for whichever one happens to come first.
  if (!indicatorName || !indicatorName.trim()) {
    return [];
  }

  const countryAliases: Record<string, string> = {
    AUS: "AUS",
    AUSTRALIA: "AUS",
    AU: "AUS",
    USA: "USA",
    US: "USA",
    "UNITED STATES": "USA",
    GBR: "GBR",
    GB: "GBR",
    UK: "GBR",
    "UNITED KINGDOM": "GBR",
    NZ: "NZL",
    NZL: "NZL",
    "NEW ZEALAND": "NZL",
    CANADA: "CAN",
    CA: "CAN",
    CAN: "CAN",
    INDIA: "IND",
    IN: "IND",
    IND: "IND",
    JAPAN: "JPN",
    JP: "JPN",
    JPN: "JPN",
    CHINA: "CHN",
    CN: "CHN",
    CHN: "CHN",
    "SOUTH AFRICA": "ZAF",
    ZA: "ZAF",
    ZAF: "ZAF",
  };

  const resolveCountryCode = (input?: string): string | undefined => {
    if (!input) {
      return undefined;
    }
    const normalized = String(input).trim().toUpperCase();
    const alias = countryAliases[normalized];
    if (alias) {
      return alias;
    }
    // Only pass through values that are exactly three letters. The code is
    // interpolated into an OData string literal below, so anything containing
    // a quote or other punctuation must not get through.
    return /^[A-Z]{3}$/.test(normalized) ? normalized : undefined;
  };

  // Look up the first indicator matching an OData filter; failures count as "not found"
  const findIndicatorCode = async (filter: string): Promise<string | undefined> => {
    try {
      const res = await superagent
        .get(`${WHO_API_BASE}/Indicator`)
        .query({
          $filter: filter,
          $top: 1,
          $format: "json",
        })
        .set("User-Agent", USER_AGENT);
      return res.body.value?.[0]?.IndicatorCode;
    } catch {
      return undefined;
    }
  };

  // Escape single quotes per OData rules
  const escapedName = indicatorName.replace(/'/g, "''");

  // 1) Exact name match, 2) fallback to contains() search
  const code =
    (await findIndicatorCode(`IndicatorName eq '${escapedName}'`)) ||
    (await findIndicatorCode(`contains(IndicatorName,'${escapedName}')`));
  if (!code) {
    return [];
  }

  // 3) Query the indicator-specific endpoint, optionally filtered by country
  const filters: string[] = [];
  const iso3 = resolveCountryCode(country || process.env.DEFAULT_COUNTRY_ISO3);
  if (iso3) filters.push(`SpatialDim eq '${iso3}'`);
  // Prefer both sexes when present
  filters.push(`(Dim1 eq 'SEX_BTSX' or Dim1 eq null)`);

  const query: Record<string, string> = {
    $orderby: "TimeDim desc",
    $top: "200",
    $format: "json",
    $filter: filters.join(" and "),
  };

  try {
    const dataRes = await superagent
      .get(`${WHO_API_BASE}/${code}`)
      .query(query)
      .set("User-Agent", USER_AGENT);
    return dataRes.body.value || [];
  } catch {
    return [];
  }
}
/**
 * List WHO indicators whose name contains the given term.
 *
 * @param term - Text to search for inside IndicatorName; single quotes are doubled for OData.
 * @returns Up to 50 `{ code, name }` pairs, or `[]` if the request or mapping fails.
 */
export async function listWhoIndicators(term: string): Promise<{ code: string; name: string }[]> {
  const odataTerm = term.replace(/'/g, "''");
  const request = superagent
    .get(`${WHO_API_BASE}/Indicator`)
    .query({
      $filter: `contains(IndicatorName,'${odataTerm}')`,
      $top: 50,
      $format: "json",
    })
    .set("User-Agent", USER_AGENT);
  try {
    const response = await request;
    const indicators = response.body.value || [];
    // Mapping stays inside the try so a malformed payload also yields []
    return indicators.map(({ IndicatorCode, IndicatorName }: any) => ({
      code: IndicatorCode,
      name: IndicatorName,
    }));
  } catch (e) {
    return [];
  }
}
// RxNorm API functions
/**
 * Search RxNav for drug concepts by name.
 *
 * Concepts are gathered from every concept group in the response (the
 * `conceptProperties`, `concept` and `minConcept` arrays) and normalised to
 * the RxNormDrug shape. Entries lacking a name or an rxcui are dropped.
 *
 * @param query - Drug name passed as the `name` query parameter.
 * @returns Normalised concepts, or `[]` when the request or parsing fails.
 */
export async function searchRxNormDrugs(query: string): Promise<RxNormDrug[]> {
  // Accept either an array or a "|"-separated string. Empty segments are
  // removed so an empty string yields [] rather than [""].
  const toList = (value: unknown): string[] => {
    if (Array.isArray(value)) return value;
    if (typeof value === "string") {
      return value
        .split("|")
        .map((part) => part.trim())
        .filter((part) => part.length > 0);
    }
    return [];
  };

  const normalize = (c: any): RxNormDrug => ({
    rxcui: c.rxcui || c.rxCui || "",
    name: c.name || c.term || "",
    synonym: toList(c.synonym),
    tty: c.tty || c.termType || "",
    language: c.language || "",
    suppress: c.suppress || "",
    umlscui: toList(c.umlscui),
  });

  try {
    const res = await superagent
      .get(`${RXNAV_API_BASE}/drugs.json`)
      .query({ name: query })
      .set("User-Agent", USER_AGENT);
    const groups: any[] = res.body?.drugGroup?.conceptGroup || [];
    const concepts: any[] = [];
    for (const group of groups) {
      // A group may carry its concepts under any of these keys
      for (const key of ["conceptProperties", "concept", "minConcept"]) {
        const entries = group?.[key];
        if (Array.isArray(entries)) {
          concepts.push(...entries);
        }
      }
    }
    return concepts
      .filter((c) => (c?.name || c?.term) && (c?.rxcui || c?.rxCui))
      .map(normalize);
  } catch (error) {
    return [];
  }
}
// Resolve after a randomly chosen wait between `min` and `max` milliseconds
function randomDelay(min: number, max: number): Promise<void> {
  const waitMs = min + Math.random() * (max - min);
  return new Promise<void>((done) => {
    setTimeout(done, waitMs);
  });
}
// Google Scholar API functions
/**
 * Search Google Scholar for articles.
 *
 * When the SERPAPI_KEY environment variable is set, SerpAPI is queried first.
 * If that request fails (or no key is set), the results page is scraped with
 * Puppeteer instead. Set DEBUG_SCHOLAR to log failures of either path.
 *
 * @param query - Free-text search query.
 * @returns Parsed articles; `[]` when scraping fails or nothing is found.
 */
export async function searchGoogleScholar(
  query: string,
): Promise<GoogleScholarArticle[]> {
  const serpApiKey = process.env.SERPAPI_KEY;
  // Prefer SerpAPI if available (more reliable than scraping)
  if (serpApiKey) {
    try {
      const res = await superagent
        .get("https://serpapi.com/search.json")
        .query({ engine: "google_scholar", q: query, api_key: serpApiKey });
      const items: any[] = res.body?.organic_results || [];
      return items.map((it) => {
        const info = it.publication_info;
        const summary: string | undefined =
          typeof info?.summary === "string" ? info.summary : undefined;
        // Structured authors live under publication_info; the top-level
        // `authors` lookup is kept as a fallback for older payloads.
        const listedAuthors: any[] | undefined = Array.isArray(info?.authors)
          ? info.authors
          : Array.isArray(it.authors)
            ? it.authors
            : undefined;
        // The summary line normally carries the publication year
        const yearFromSummary = summary?.match(/\b(?:19|20)\d{2}\b/)?.[0];
        return {
          title: it.title || "",
          authors: summary || listedAuthors?.map((a: any) => a.name).join(", "),
          abstract: it.snippet || "",
          journal: it.publication || it.journal || "",
          year: it.year ? String(it.year) : yearFromSummary,
          citations: it.inline_links?.cited_by?.total
            ? `Cited by ${it.inline_links.cited_by.total}`
            : undefined,
          url: it.link || it.resources?.[0]?.link,
        };
      });
    } catch (error) {
      if (process.env.DEBUG_SCHOLAR) {
        console.warn("SerpAPI Google Scholar fallback failed:", error);
      }
      // fall through to Puppeteer
    }
  }

  const browserUserAgent =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
  let browser;
  try {
    // Add a small random delay to avoid rate limiting
    await randomDelay(1000, 3000);
    browser = await puppeteer.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        // NOTE(review): --disable-web-security is probably unnecessary for a
        // read-only scrape; kept to avoid changing launch behaviour - confirm.
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
        `--user-agent=${browserUserAgent}`,
      ],
    });
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });
    await page.setUserAgent(browserUserAgent);
    await page.setExtraHTTPHeaders({
      "Accept-Language": "en-US,en;q=0.9",
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "Cache-Control": "no-cache",
      Pragma: "no-cache",
    });
    const searchUrl = `${GOOGLE_SCHOLAR_API_BASE}?q=${encodeURIComponent(query)}&hl=en`;
    await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 45000 });
    try {
      await page.waitForSelector(".gs_r, .gs_ri, [data-rp]", { timeout: 20000 });
    } catch (error) {
      // The wait can time out even though results are present; re-check once
      const hasAny = await page.$(".gs_r, .gs_ri, [data-rp]");
      if (!hasAny) {
        throw new Error("No search results found or page structure changed");
      }
    }
    // Everything inside evaluate() runs in the page, so it must be
    // self-contained and cannot reference helpers from this module.
    return await page.evaluate(() => {
      const results: GoogleScholarArticle[] = [];
      // The selector matches both a result container and its nested body, so
      // the same article can be visited twice; remember what was emitted.
      const emitted = new Set<string>();
      const articleElements = document.querySelectorAll(
        ".gs_r, .gs_ri, [data-rp]",
      );
      articleElements.forEach((element) => {
        // querySelector returns the first match in document order, so the
        // anchor has to be queried on its own; a combined
        // ".gs_rt a, .gs_rt" selector would return the heading and lose href.
        const titleLink = (element.querySelector(".gs_rt a, h3 a") ||
          element.querySelector("a[data-clk]")) as HTMLAnchorElement | null;
        const titleHeading = element.querySelector(".gs_rt, h3");
        const title =
          titleLink?.textContent?.trim() ||
          titleHeading?.textContent?.trim() ||
          "";
        const url = titleLink?.href || "";

        const authorsElement =
          element.querySelector(".gs_a, .gs_authors, .gs_venue") ||
          element.querySelector('[class*="author"]') ||
          element.querySelector('[class*="venue"]');
        const authors = authorsElement?.textContent?.trim() || "";

        const abstractElement =
          element.querySelector(".gs_rs, .gs_rs_a, .gs_snippet") ||
          element.querySelector('[class*="snippet"]') ||
          element.querySelector('[class*="abstract"]');
        const abstract = abstractElement?.textContent?.trim() || "";

        // Pick the "Cited by N" link specifically rather than the whole footer
        let citations = "";
        element
          .querySelectorAll('.gs_fl a, a[href*="cites"]')
          .forEach((anchor) => {
            if (citations) return;
            const text = anchor.textContent?.trim() || "";
            const href = anchor.getAttribute("href") || "";
            if (/^Cited by\b/i.test(text) || href.includes("cites=")) {
              citations = text;
            }
          });

        // Only accept plausible publication years, not any four digits
        const yearPattern = /\b((?:19|20)\d{2})\b/;
        const yearMatch =
          authors.match(yearPattern) ||
          title.match(yearPattern) ||
          abstract.match(yearPattern);
        const year = yearMatch ? yearMatch[1] : "";

        // The byline usually reads "authors - venue, year - publisher"; with
        // three or more segments the venue is the middle one. Otherwise fall
        // back to the looser patterns.
        let journal = "";
        const segments = authors.split(/\s+-\s+/);
        if (segments.length >= 3) {
          journal = (segments[1] || "")
            .replace(/,?\s*(?:19|20)\d{2}\s*$/, "")
            .trim();
        } else {
          const journalMatch =
            authors.match(/- ([^-]+)$/) ||
            authors.match(/, ([^,]+)$/) ||
            authors.match(/in ([^,]+)/);
          if (journalMatch) {
            journal = journalMatch[1].trim();
          }
        }

        const identity = `${title}|${url}`;
        if (title && title.length > 5 && !emitted.has(identity)) {
          emitted.add(identity);
          results.push({
            title,
            authors,
            abstract,
            journal,
            year,
            citations,
            url,
          });
        }
      });
      return results;
    });
  } catch (error) {
    if (process.env.DEBUG_SCHOLAR) {
      console.warn("Error scraping Google Scholar:", error);
    }
    return [];
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // A failure to close must not replace the value already being returned
        if (process.env.DEBUG_SCHOLAR) {
          console.warn("Error closing browser:", closeError);
        }
      }
    }
  }
}
/**
 * Search PubMed and return basic details for the matching articles.
 *
 * Runs an `esearch` to obtain article IDs, then an `efetch` (XML) for those
 * IDs. The XML is processed one `<PubmedArticle>` element at a time so that
 * each PMID is always paired with the title from the same record.
 *
 * @param query - PubMed search term.
 * @param maxResults - Maximum number of IDs to request (default 10).
 * @returns Parsed articles; `[]` when there are no hits or any step fails.
 *          Fields missing from a record keep their placeholder text.
 */
export async function searchPubMedArticles(
  query: string,
  maxResults: number = 10,
): Promise<PubMedArticle[]> {
  // Build a pattern for <tag ...>inner</tag>, ignoring self-closing elements
  const elementPattern = (tag: string, flags: string): RegExp =>
    new RegExp(`<${tag}(?:\\s[^>]*)?(?<!/)>([\\s\\S]*?)</${tag}>`, flags);

  // Inner XML of the first <tag> element, if present
  const firstInner = (xml: string, tag: string): string | undefined =>
    xml.match(elementPattern(tag, ""))?.[1];

  // Inner XML of every <tag> element, in document order
  const allInner = (xml: string, tag: string): string[] => {
    const pattern = elementPattern(tag, "g");
    const found: string[] = [];
    let m: RegExpExecArray | null;
    while ((m = pattern.exec(xml)) !== null) {
      found.push(m[1] ?? "");
    }
    return found;
  };

  const fromCodePoint = (codePoint: number, fallback: string): string =>
    codePoint >= 0 && codePoint <= 0x10ffff
      ? String.fromCodePoint(codePoint)
      : fallback;

  // Strip inline markup (<i>, <sub>, ...), decode entities, collapse whitespace.
  // &amp; is decoded last so "&amp;lt;" ends up as the literal text "&lt;".
  const toText = (fragment: string): string =>
    fragment
      .replace(/<[^>]+>/g, "")
      .replace(/&#x([0-9a-fA-F]+);/g, (whole, hex) =>
        fromCodePoint(parseInt(hex, 16), whole),
      )
      .replace(/&#(\d+);/g, (whole, dec) => fromCodePoint(Number(dec), whole))
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, "&")
      .replace(/\s+/g, " ")
      .trim();

  try {
    // First, search for article IDs
    const searchRes = await superagent
      .get(`${PUBMED_API_BASE}/esearch.fcgi`)
      .query({
        db: "pubmed",
        term: query,
        retmode: "json",
        retmax: maxResults,
      })
      .set("User-Agent", USER_AGENT);
    const idList = searchRes.body.esearchresult?.idlist || [];
    if (idList.length === 0) return [];

    // Then, fetch article details
    const fetchRes = await superagent
      .get(`${PUBMED_API_BASE}/efetch.fcgi`)
      .query({
        db: "pubmed",
        id: idList.join(","),
        retmode: "xml",
      })
      .set("User-Agent", USER_AGENT);
    const xmlText = String(fetchRes.text ?? "");

    // "[\s>]" after the tag name keeps the <PubmedArticleSet> root from matching
    const records =
      xmlText.match(/<PubmedArticle[\s>][\s\S]*?<\/PubmedArticle>/g) || [];

    const articles: PubMedArticle[] = [];
    for (const record of records) {
      // The first PMID in a record is taken as the article's own; later ones
      // may belong to related or commented-on articles.
      const pmid = firstInner(record, "PMID")?.trim();
      const title = toText(firstInner(record, "ArticleTitle") ?? "");
      if (!pmid || !/^\d+$/.test(pmid) || !title) continue;

      const abstractXml = firstInner(record, "Abstract") ?? "";
      const abstract = allInner(abstractXml, "AbstractText")
        .map(toText)
        .filter(Boolean)
        .join(" ");

      const authors = allInner(firstInner(record, "AuthorList") ?? "", "Author")
        .map((authorXml) => {
          const collective = firstInner(authorXml, "CollectiveName");
          if (collective) return toText(collective);
          const family = toText(firstInner(authorXml, "LastName") ?? "");
          const given = toText(
            firstInner(authorXml, "ForeName") ??
              firstInner(authorXml, "Initials") ??
              "",
          );
          return [given, family].filter(Boolean).join(" ");
        })
        .filter(Boolean);

      const journalXml = firstInner(record, "Journal") ?? "";
      const journal = toText(
        firstInner(journalXml, "Title") ??
          firstInner(journalXml, "ISOAbbreviation") ??
          "",
      );

      const pubDateXml = firstInner(journalXml, "PubDate") ?? "";
      const publicationDate =
        ["Year", "Month", "Day"]
          .map((part) => toText(firstInner(pubDateXml, part) ?? ""))
          .filter(Boolean)
          .join(" ") || toText(firstInner(pubDateXml, "MedlineDate") ?? "");

      articles.push({
        pmid,
        title,
        abstract: abstract || "Abstract not available in this format",
        authors,
        journal: journal || "Journal information not available",
        publication_date: publicationDate || "Date not available",
      });
    }
    return articles;
  } catch (error) {
    return [];
  }
}
// PBS (Australia) Public API helpers
// Date.now() timestamp recorded by pbsGet for its most recent PBS request;
// null until one has been recorded. pbsGet reads it to space out calls.
let pbsLastCallAt: number | null = null;
/** Drop one trailing "/" from a base URL, if it has one. */
function normalizeBaseUrl(base: string): string {
  if (!base.endsWith("/")) {
    return base;
  }
  return base.substring(0, base.length - 1);
}
/**
 * Perform a throttled GET against the PBS public API.
 *
 * Calls are spaced at least PBS_MIN_INTERVAL_MS apart (default 20000 ms),
 * measured from the completion of the previous call whether it succeeded or
 * failed. The PBS_SUBSCRIPTION_KEY environment variable, when set, is sent as
 * the `Subscription-Key` header.
 *
 * @param endpoint - Path relative to the API base; a leading "/" and a leading
 *                   "api/v3/" prefix are both accepted and stripped.
 * @param queryParams - Optional query-string parameters.
 * @returns The parsed response body, falling back to the raw response text.
 * @throws Error when PBS_API_BASE is not set; request errors are propagated.
 */
export async function pbsGet(
  endpoint: string,
  queryParams?: Record<string, string | number | boolean>,
): Promise<unknown> {
  const base = PBS_API_BASE;
  if (!base) {
    throw new Error(
      "PBS_API_BASE is not set. Please set the base URL for the PBS public API (e.g., https://data-api.health.gov.au/pbs/api/v3).",
    );
  }

  // A non-numeric setting would become NaN and make every comparison below
  // false, silently disabling the throttle; fall back to the default instead.
  const configuredInterval = Number(process.env.PBS_MIN_INTERVAL_MS || 20000);
  const minIntervalMs =
    Number.isFinite(configuredInterval) && configuredInterval >= 0
      ? configuredInterval
      : 20000;

  if (pbsLastCallAt) {
    const elapsed = Date.now() - pbsLastCallAt;
    if (elapsed < minIntervalMs) {
      // Wait out the remainder plus a small safety margin
      const waitMs = minIntervalMs - elapsed + 50;
      await randomDelay(waitMs, waitMs + 10);
    }
  }

  // Allow callers to pass endpoints with or without api/v3 prefix
  const normalizedEndpoint = endpoint
    .replace(/^\//, "")
    .replace(/^api\/v3\//, "");
  const url = `${normalizeBaseUrl(base)}/${normalizedEndpoint}`;
  const req = superagent.get(url).query(queryParams || {}).set("User-Agent", USER_AGENT);

  const subKey = process.env.PBS_SUBSCRIPTION_KEY;
  if (!subKey) {
    console.warn("Warning: PBS_SUBSCRIPTION_KEY is not set; requests may be rejected by the API.");
  } else {
    req.set("Subscription-Key", subKey);
  }

  try {
    const res = await req.timeout({
      response: 30000,
      deadline: 60000,
    });
    try {
      return res.body ?? JSON.parse(res.text);
    } catch {
      return res.text;
    }
  } finally {
    // Record the call even when it failed, so a rejected or timed-out request
    // still counts towards the minimum interval before the next attempt.
    pbsLastCallAt = Date.now();
  }
}
// In-memory TTL cache for PBS GET responses.
// Each entry stores the cached payload and the epoch-millisecond time at which it expires.
interface CacheEntry {
  value: unknown;
  expiresAt: number;
}
// Upper bound on the number of cached entries before pruning
const PBS_CACHE_MAX_ENTRIES = 200;
const pbsCache: Map<string, CacheEntry> = new Map();
/**
 * Build a stable cache key for a PBS GET request.
 *
 * The endpoint is normalised the same way the request URL is built (leading
 * "/" and leading "api/v3/" removed) and a trailing "/" is dropped, so
 * equivalent spellings share one entry. Parameters are sorted by name and
 * percent-encoded so that values containing "&" or "=" cannot collide with a
 * different parameter set.
 *
 * @param endpoint - Endpoint as passed by the caller.
 * @param queryParams - Optional query parameters.
 * @returns A key of the form `endpoint?a=1&b=2`.
 */
function buildCacheKey(endpoint: string, queryParams?: Record<string, string | number | boolean>): string {
  const normalizedEndpoint = String(endpoint)
    .replace(/^\//, "")
    .replace(/^api\/v3\//, "")
    .replace(/\/$/, "");
  const qp = queryParams ?? {};
  const parts = Object.keys(qp)
    .sort()
    .map((k) => `${encodeURIComponent(k)}=${encodeURIComponent(String(qp[k]))}`);
  return `${normalizedEndpoint}?${parts.join("&")}`;
}
/**
 * Trim pbsCache back down to PBS_CACHE_MAX_ENTRIES by removing entries from
 * the front of the map's iteration order (the earliest-inserted keys).
 */
function pruneCacheIfNeeded() {
  const overflow = pbsCache.size - PBS_CACHE_MAX_ENTRIES;
  if (overflow <= 0) return;
  // Snapshot the keys to evict first, then remove them
  const evictKeys = Array.from(pbsCache.keys()).slice(0, overflow);
  evictKeys.forEach((k) => {
    pbsCache.delete(k);
  });
}
/**
 * Cached variant of pbsGet.
 *
 * Returns a stored response while it is still fresh; otherwise performs the
 * request, stores the result and prunes the cache if it has grown too large.
 *
 * @param endpoint - Endpoint passed through to pbsGet.
 * @param queryParams - Optional query parameters passed through to pbsGet.
 * @param ttlMs - Lifetime of a cached response in milliseconds. Defaults to
 *                PBS_CACHE_TTL_MS from the environment, or five minutes.
 * @returns The cached or freshly fetched response.
 * @throws Whatever pbsGet throws; failed requests are not cached.
 */
export async function pbsGetCached(
  endpoint: string,
  queryParams?: Record<string, string | number | boolean>,
  ttlMs: number = Number(process.env.PBS_CACHE_TTL_MS || 5 * 60 * 1000),
): Promise<unknown> {
  const key = buildCacheKey(endpoint, queryParams);
  const hit = pbsCache.get(key);
  if (hit) {
    if (hit.expiresAt > Date.now()) return hit.value;
    // Expired: drop it so it does not linger if the refresh below fails
    pbsCache.delete(key);
  }

  const value = await pbsGet(endpoint, queryParams);

  // Start the TTL when the response arrives. pbsGet may wait for its throttle
  // and the request itself can be slow, so a timestamp taken before the call
  // would shorten the effective lifetime.
  // Delete before set so the refreshed key moves to the end of the map's
  // insertion order and is not the first candidate for pruning.
  pbsCache.delete(key);
  pbsCache.set(key, { value, expiresAt: Date.now() + ttlMs });
  pruneCacheIfNeeded();
  return value;
}
/**
 * Ask the PBS `schedules` endpoint for the latest schedule and return its
 * schedule_code. The response's `data` field may be an array or a single
 * object; the first row is used in either case.
 *
 * @returns The schedule_code of the first row.
 * @throws Error when the response contains no usable schedule_code.
 */
export async function resolveLatestScheduleCode(): Promise<string | number> {
  const response = (await pbsGetCached("schedules", {
    get_latest_schedule_only: "true",
    limit: 1,
  })) as any;
  const payload = response?.data;

  let latest: any;
  if (Array.isArray(payload)) {
    latest = payload[0];
  } else if (payload) {
    latest = payload;
  }

  const scheduleCode = latest?.schedule_code;
  if (scheduleCode) {
    return scheduleCode;
  }
  throw new Error("Could not resolve latest PBS schedule_code");
}
/**
 * Look up one PBS item by its item code (`pbs_code`).
 *
 * @param pbsCode - Item code to search for.
 * @param scheduleCode - Optional schedule to restrict the lookup to; sent as
 *                       `schedule_code` only when it is not null/undefined.
 * @returns The first element of the response's `data` array, or `null` when absent.
 */
export async function getItemByCode(
  pbsCode: string,
  scheduleCode?: string | number,
): Promise<any | null> {
  const scheduleFilter: Record<string, string> =
    scheduleCode != null ? { schedule_code: String(scheduleCode) } : {};
  const params: Record<string, string> = {
    pbs_code: String(pbsCode),
    limit: "1",
    ...scheduleFilter,
  };
  const response = (await pbsGetCached("items", params)) as any;
  const firstItem = response?.data?.[0];
  return firstItem ?? null;
}