// scrap.ts • 778 B
// Tool to scrape a webpage
/**
 * Fetches the raw HTML of a webpage and extracts the text content
 * matching the provided regular expression. By default it collects
 * the text inside all <h2> elements (any letter case, possibly spanning
 * several lines), which is useful for grabbing headlines from simple
 * news sites.
 *
 * For every match, the first capture group (or the whole match when the
 * pattern has no capture group) is stripped of nested tags and trimmed;
 * results that end up empty are skipped.
 *
 * @param url - Address of the page to download.
 * @param options - Optional settings.
 *   - `pattern`: expression applied to the HTML. It is copied before use,
 *     so the caller's `lastIndex` is neither read nor modified, and a
 *     missing `g` flag is added on the copy so every occurrence is found.
 *   - `signal`: forwarded to `fetch` so the request can be aborted.
 * @returns The extracted, non-empty text snippets in document order.
 * @throws Error when the response status is not in the 2xx range.
 *   Rejections from `fetch` itself (network failure, abort) propagate
 *   unchanged.
 */
export async function scrap(
  url: string,
  options?: { pattern?: RegExp; signal?: AbortSignal }
): Promise<string[]> {
  // `[\s\S]` instead of `.` so headings broken across lines still match.
  const { pattern = /<h2[^>]*>([\s\S]*?)<\/h2>/gi, signal } = options ?? {};

  const res = await fetch(url, { signal });
  if (!res.ok) {
    // Without this check the body of an error page would be scraped as content.
    throw new Error(`Failed to fetch ${url}: ${res.status} ${res.statusText}`);
  }
  const html = await res.text();

  // Work on a private global copy: a non-global pattern would otherwise
  // never advance, and a shared global one would carry `lastIndex` state
  // between calls.
  const flags = pattern.flags.includes("g") ? pattern.flags : `${pattern.flags}g`;
  const matcher = new RegExp(pattern.source, flags);

  const results: string[] = [];
  // `matchAll` also steps past zero-length matches, so the loop always ends.
  for (const match of html.matchAll(matcher)) {
    const captured = match[1] ?? match[0];
    const text = captured.replace(/<[^>]+>/g, "").trim();
    if (text) results.push(text);
  }
  return results;
}