GenAIScript

Official
by microsoft
MIT License
parse.ts (14.5 kB)
import replaceExt from "replace-ext"
import { readFile, writeFile } from "node:fs/promises"
import { DOCXTryParse } from "../../core/src/docx"
import { extractFenced } from "../../core/src/fence"
import {
    expandFiles,
    writeText,
    readText,
    tryReadText,
} from "../../core/src/fs"
import { HTMLToMarkdown, HTMLToText } from "../../core/src/html"
import { isJSONLFilename, JSONLTryParse } from "../../core/src/jsonl"
import { parsePdf } from "../../core/src/pdf"
import { estimateTokens } from "../../core/src/tokens"
import { YAMLStringify } from "../../core/src/yaml"
import { resolveTokenEncoder } from "../../core/src/encoders"
import {
    CONSOLE_TOKEN_COLORS,
    MD_REGEX,
    PROMPTY_REGEX,
} from "../../core/src/constants"
import { promptyParse, promptyToGenAIScript } from "../../core/src/prompty"
import { basename, join } from "node:path"
import { CSVStringify, dataToMarkdownTable } from "../../core/src/csv"
import { INIStringify } from "../../core/src/ini"
import { JSON5Stringify } from "../../core/src/json5"
import { jinjaRender } from "../../core/src/jinja"
import { splitMarkdown } from "../../core/src/frontmatter"
import { parseOptionsVars } from "./vars"
import { dataTryParse } from "../../core/src/data"
import { resolveFileContent } from "../../core/src/file"
import { redactSecrets } from "../../core/src/secretscanner"
import { ellipse, logVerbose } from "../../core/src/util"
import { chunkMarkdown } from "../../core/src/mdchunk"
import { normalizeInt } from "../../core/src/cleaners"
import { prettyBytes } from "../../core/src/pretty"
import { terminalSize } from "../../core/src/terminal"
import { consoleColors, wrapColor } from "../../core/src/consolecolor"
import { genaiscriptDebug } from "../../core/src/debug"
import { stderr, stdout } from "../../core/src/stdio"

const dbg = genaiscriptDebug("cli:parse")

/**
 * This module provides various parsing utilities for different file types such
 * as PDF, DOCX, HTML, JSONL, and more. It includes functions to extract and
 * convert data, estimate tokens, and transform file formats.
 */

/**
 * Extracts and logs fenced code blocks of a specific language from a file.
 * Filters the fenced blocks by the specified language and logs their content.
 * @param language - The language to filter the fenced blocks by.
 * @param file - The file to parse and extract fenced code blocks from.
 */
export async function parseFence(language: string, file: string) {
    const res = await resolveFileContent({ filename: file })
    const fences = extractFenced(res.content || "").filter(
        (f) => f.language === language
    )
    // Logs the content of the filtered fenced blocks
    console.log(fences.map((f) => f.content).join("\n\n"))
}
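// Example (illustrative; not part of the original file): assuming a markdown
// file `notes.md` that contains ```ts fenced blocks, the call below would
// print the body of each TypeScript fence, separated by blank lines.
//
//   await parseFence("ts", "notes.md")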
/**
 * Parses the contents of a PDF file and outputs them in text format.
 * Optionally writes the content and page images to the specified output directory.
 * If an output directory is specified, the text content is saved as a .txt file,
 * and page images (if any) are saved as .png files.
 * Logs the writing process for each file.
 * If no output directory is specified, logs the text content to the console.
 * @param file - The PDF file to parse.
 * @param options - Options to include images and specify the output directory.
 * - images: Whether to include page images in the output.
 * - out: The output directory where files will be saved.
 */
export async function parsePDF(
    file: string,
    options: { images: boolean; out: string }
) {
    const { images, out } = options
    const { content, pages } = await parsePdf(file, { renderAsImage: images })
    if (out) {
        const fn = basename(file)
        console.log(`writing ${join(out, fn + ".txt")}`)
        await writeText(join(out, fn + ".txt"), content || "")
        for (const page of pages) {
            if (page.image) {
                const n = join(out, fn + ".page" + page.index + ".png")
                console.log(`writing ${n}`)
                await writeFile(n, page.image)
            }
        }
    } else {
        console.log(content || "")
    }
}

/**
 * Parses the contents of a DOCX file and logs the extracted text.
 * If an error occurs during parsing, it logs the error.
 * Uses DOCXTryParse to extract text from the DOCX file.
 * @param file - The path to the DOCX file to parse.
 * @param options - Options for parsing the DOCX file.
 */
export async function parseDOCX(file: string, options: DocxParseOptions) {
    // Uses DOCXTryParse to extract text from the DOCX file
    const res = await DOCXTryParse(file, options)
    if (res.error) console.error(res.error)
    else console.log(res.file.content)
}

/**
 * Converts HTML content to text and logs it or writes it to a file.
 * @param fileOrUrl - The HTML file or URL to convert.
 * @param options - Options to specify the output format ("markdown" or "text") and the output file path.
 */
export async function parseHTMLToText(
    fileOrUrl: string,
    options: { format?: "markdown" | "text"; out?: string }
) {
    const { format = "markdown", out } = options || {}
    const file: WorkspaceFile = { filename: fileOrUrl }
    await resolveFileContent(file)
    // Converts HTML to plain text
    let text: string
    if (format === "markdown") text = await HTMLToMarkdown(file.content)
    else text = await HTMLToText(file.content)
    if (out) {
        logVerbose(`writing ${out}`)
        await writeText(out, text)
    } else console.log(text)
}

/**
 * Parses a Jinja2 file, substitutes variables, and logs the rendered output.
 *
 * @param file - The path to the Jinja2 template file to parse.
 * @param options - An object containing the following properties:
 * - vars: An array of key-value pairs in the format "key=value" to replace variables in the template.
 *
 * The function reads the template file, processes it based on its type (Prompty or Markdown),
 * substitutes the provided variables, and renders the output. Variable values are converted
 * to numbers if possible. Environment variables are also considered during substitution.
 */
export async function parseJinja2(file: string, options: { vars: string[] }) {
    let src = await readFile(file, { encoding: "utf-8" })
    if (PROMPTY_REGEX.test(file)) src = promptyParse(file, src).content
    else if (MD_REGEX.test(file)) src = splitMarkdown(src).content
    const vars: Record<string, any> = parseOptionsVars(
        options.vars,
        process.env
    )
    for (const k in vars) {
        const i = parseFloat(vars[k])
        if (!isNaN(i)) vars[k] = i
    }
    const res: string = jinjaRender(src, vars)
    console.log(res)
}
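// Example (illustrative; not part of the original file): assuming a template
// `greet.md` containing `Hello {{ name }}, attempt {{ n }}`, the call below
// renders it with `name` bound to the string "Ada" and `n` coerced to the
// number 3 by the parseFloat pass above.
//
//   await parseJinja2("greet.md", { vars: ["name=Ada", "n=3"] })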
/**
 * Parses the input file and converts its data into a specified format.
 *
 * @param file - Path to the file to be read and parsed.
 * @param options - Configuration options for the output format.
 * @param options.format - The target format for the output. Supported formats include:
 * - "yaml": Converts data to YAML format.
 * - "ini": Converts data to INI format.
 * - "csv": Converts data into a CSV format with a header row.
 * - "md" or "markdown": Converts data into a Markdown table.
 * - "json5": Converts data into JSON5 format.
 * - Default: Outputs data as a prettified JSON string.
 *
 * Logs the converted data to the console.
 * Throws an error if the data format cannot be determined.
 */
export async function parseAnyToJSON(
    file: string,
    options: { format: string }
) {
    const data = await dataTryParse({ filename: file })
    if (!data) throw new Error(`Unknown data format for ${file}`)
    let out: string
    switch (options?.format?.toLowerCase() || "") {
        case "yaml":
            out = YAMLStringify(data)
            break
        case "ini":
            out = INIStringify(data)
            break
        case "csv":
            out = CSVStringify(data, { header: true })
            break
        case "md":
        case "markdown":
            out = dataToMarkdownTable(data)
            break
        case "json5":
            out = JSON5Stringify(data, null, 2)
            break
        default:
            out = JSON.stringify(data, null, 2)
            break
    }
    console.log(out)
}

/**
 * Converts JSONL files to JSON files.
 * Processes an array of files or glob patterns, skipping non-JSONL files,
 * and writes the converted JSON content to new files with a ".json" extension.
 * Logs the conversion process for each file.
 * @param files - An array of files or glob patterns to process.
 */
export async function jsonl2json(files: string[]) {
    for (const file of await expandFiles(files, { applyGitIgnore: false })) {
        if (!isJSONLFilename(file)) {
            // Skips files that are not JSONL
            console.log(`skipping ${file}`)
            continue
        }
        const content = await tryReadText(file)
        const objs = await JSONLTryParse(content, { repair: true })
        const out: string = replaceExt(file, ".json")
        await writeText(out, JSON.stringify(objs, null, 2))
        console.log(`${file} -> ${out}`)
    }
}

/**
 * Estimates the number of tokens in the content of files and logs the results.
 * @param filesGlobs - An array of files or glob patterns to process.
 * @param options - Options for processing files.
 * - excludedFiles - A list of files to exclude from processing.
 * - model - The name of the model used for token encoding.
 * - ignoreGitIgnore - Whether to ignore .gitignore rules when expanding files.
 */
export async function parseTokens(
    filesGlobs: string[],
    options: {
        excludedFiles: string[]
        model: string
        ignoreGitIgnore: boolean
    }
) {
    const { model } = options || {}
    const { encode: encoder } = await resolveTokenEncoder(model)
    const files = await expandFiles(filesGlobs, options)
    console.log(`parsing ${files.length} files`)
    let text = ""
    for (const file of files) {
        const content = await readText(file)
        if (content) {
            const tokens = estimateTokens(content, encoder)
            console.log(`${file}, ${tokens}`)
            text += `${file}, ${tokens}\n`
        }
    }
    // Logs the aggregated text with file names and token estimates
    console.log(text)
}

/**
 * Tokenizes the content of a specified file using a provided model and logs the tokens.
 *
 * @param file - Path to the file to tokenize.
 * @param options - Object containing the following properties:
 * - model - The name of the model used for token encoding.
 *
 * The function reads the content of the file, tokenizes it using the given model,
 * and logs each token along with its hexadecimal representation.
 * Debug information about the process is also logged.
 */
export async function parseTokenize(file: string, options: { model: string }) {
    const text = await readText(file)
    dbg(`text: %s`, text)
    const { model } = options || {}
    const {
        model: tokenModel,
        encode: encoder,
        decode: decoder,
    } = await resolveTokenEncoder(model)
    console.debug(`model: %s`, tokenModel)
    const tokens = encoder(text)
    for (const token of tokens) {
        stdout.write(
            `(${wrapColor(CONSOLE_TOKEN_COLORS[0], decoder([token]))}, x${wrapColor(CONSOLE_TOKEN_COLORS[1], token.toString(16))})`
        )
    }
}
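// Example (illustrative; not part of the original file): the model identifier
// below is an assumption; any id accepted by resolveTokenEncoder works. The
// first call prints a `file, tokens` line per matching file; the second dumps
// each token of a single file with its hex id.
//
//   await parseTokens(["src/**/*.ts"], {
//       excludedFiles: [],
//       model: "gpt-4o",
//       ignoreGitIgnore: false,
//   })
//   await parseTokenize("README.md", { model: "gpt-4o" })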
/**
 * Converts "prompty" format files to GenAI script files.
 *
 * @param files - An array of file paths to process.
 * @param options - An object containing the following properties:
 * - out: The output directory where the converted files will be written.
 *
 * Logs the conversion and writes the output files to the specified directory,
 * or replaces the extension in place if no directory is provided.
 */
export async function prompty2genaiscript(
    files: string[],
    options: { out: string }
) {
    const { out } = options
    const fs = await expandFiles(files)
    for (const f of fs) {
        const gf = out
            ? join(out, replaceExt(basename(f), ".genai.mts"))
            : replaceExt(f, ".genai.mts")
        console.log(`${f} -> ${gf}`)
        const content = await readText(f)
        const doc = promptyParse(f, content)
        const script: string = promptyToGenAIScript(doc)
        await writeText(gf, script)
    }
}

/**
 * Scans a list of files for sensitive information or secrets.
 * Logs each file containing secrets and the types of secrets found.
 * Issues a warning if secrets are found in any files.
 *
 * @param files - A list of file paths or glob patterns to scan.
 */
export async function parseSecrets(files: string[]) {
    const fs = await expandFiles(files)
    let n = 0
    for (const f of fs) {
        const content = await readText(f)
        const { found } = redactSecrets(content)
        const entries = Object.entries(found)
        if (entries.length) {
            n++
            console.log(
                `${f}: ${entries.map(([k, v]) => `${k} (${v})`).join(", ")}`
            )
        }
    }
    if (n > 0) console.warn(`found secrets in ${n} of ${fs.length} files`)
}

/**
 * Parses a markdown file, breaks it into chunks based on token limits, and logs a preview of each chunk.
 *
 * @param filename - The name of the markdown file to parse.
 * @param options - Object containing parsing options.
 * - model - The model name used for token encoding.
 * - maxTokens - The maximum number of tokens allowed per chunk.
 */
export async function parseMarkdown(
    filename: string,
    options: { model: string; maxTokens: string }
) {
    const maxTokens = normalizeInt(options.maxTokens) || 1024
    const file: WorkspaceFile = { filename }
    await resolveFileContent(file)
    if (file.size) console.debug(`file: ${prettyBytes(file.size)}`)
    const encoding = await resolveTokenEncoder(options?.model, {
        disableFallback: false,
    })
    const res = await chunkMarkdown(
        file,
        (text) => encoding.encode(text).length,
        {
            maxTokens,
        }
    )
    const cols = terminalSize().columns
    for (const { content, filename, lineStart, lineEnd } of res) {
        const prefix = `${basename(filename)} (${lineStart}-${lineEnd}): `
        console.log(
            `${prefix}${ellipse(content.replace(/\n/g, " "), cols - prefix.length)}`
        )
    }
}
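These helpers are self-contained async functions that read files and log to the console, so exposing them as subcommands is a thin wiring layer. The sketch below is a minimal, hypothetical example using the commander package; the real GenAIScript CLI registers its commands elsewhere in the repository, and the command and option names here are assumptions for illustration only.

import { Command } from "commander"
import { parseFence, parsePDF } from "./parse"

// Hypothetical wiring sketch; not the actual GenAIScript CLI registration.
const program = new Command("parse-demo")

program
    .command("fence <language> <file>")
    .description("print fenced code blocks of the given language")
    .action(parseFence)

program
    .command("pdf <file>")
    .description("extract text (and optionally page images) from a PDF")
    .option("--images", "render pages as images", false)
    .option("--out <dir>", "output directory for .txt and .png files")
    .action((file, opts) =>
        parsePDF(file, { images: opts.images, out: opts.out })
    )

program.parseAsync(process.argv)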
