Skip to main content
Glama

Storyden

by Southclaws
Mozilla Public License 2.0
227
reader.go3.52 kB
package pdf import ( "context" _ "embed" "log/slog" "strings" "sync" "time" "github.com/Southclaws/fault" "github.com/Southclaws/fault/fctx" "github.com/klippa-app/go-pdfium" "github.com/klippa-app/go-pdfium/requests" "github.com/klippa-app/go-pdfium/webassembly" "go.uber.org/fx" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) type Extractor struct { logger *slog.Logger pool pdfium.Pool instance pdfium.Pdfium initlock sync.Mutex firstUse bool readyChan chan bool } func New(lc fx.Lifecycle, logger *slog.Logger) (*Extractor, error) { e := Extractor{ readyChan: make(chan bool, 1), } lc.Append(fx.StartHook(func(ctx context.Context) error { init := func() error { start := time.Now() pool, err := webassembly.Init(webassembly.Config{ MinIdle: 1, MaxIdle: 1, MaxTotal: 1, }) if err != nil { return err } instance, err := pool.GetInstance(time.Second * 30) if err != nil { return err } e.initlock.Lock() defer e.initlock.Unlock() e.pool = pool e.instance = instance logger.Debug("pdf worker pool initialised", slog.Duration("time_taken", time.Since(start))) e.readyChan <- true return nil } go func() { if err := init(); err != nil { logger.Error("failed to initialize PDFium worker pool", slog.String("error", err.Error())) } }() return nil })) return &e, nil } func (e *Extractor) waitForReady() error { if e.firstUse { return nil } e.initlock.Lock() defer e.initlock.Unlock() select { case <-e.readyChan: e.firstUse = true case <-time.After(time.Second * 30): if e.firstUse { return nil } return fault.New("PDFium worker pool did not initialize within 30 seconds") } return nil } type ExtractionResult struct { Text string HTML *html.Node } func (e *Extractor) Extract(ctx context.Context, buf []byte) (*ExtractionResult, error) { if err := e.waitForReady(); err != nil { return nil, err } doc, err := e.instance.OpenDocument(&requests.OpenDocument{ File: &buf, }) if err != nil { return nil, fault.Wrap(err, fctx.With(ctx)) } defer func() { _, err := e.instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) if err != nil { e.logger.Error("failed to close document", slog.String("error", err.Error())) } }() pages, err := e.instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{ Document: doc.Document, }) if err != nil { return nil, fault.Wrap(err, fctx.With(ctx)) } blocks := []string{} root := html.Node{ Type: html.ElementNode, Data: "body", DataAtom: atom.Body, } for p := range pages.PageCount { pageNode := &html.Node{ Type: html.ElementNode, Data: "p", DataAtom: atom.P, } pageText, err := e.instance.GetPageText(&requests.GetPageText{ Page: requests.Page{ ByIndex: &requests.PageByIndex{ Document: doc.Document, Index: p, }, }, }) if err != nil { return nil, err } data := strip(pageText.Text) pageNode.AppendChild(&html.Node{ Type: html.TextNode, Data: data, }) root.AppendChild(pageNode) blocks = append(blocks, pageText.Text) } return &ExtractionResult{ Text: strings.Join(blocks, "\n"), HTML: &root, }, nil } func strip(s string) string { trim := strings.TrimSpace(s) newlines := strings.ReplaceAll(trim, "\n", " ") tabs := strings.ReplaceAll(newlines, "\t", " ") doublespace := strings.Join(strings.Fields(tabs), " ") return doublespace }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Southclaws/storyden'

If you have feedback or need assistance with the MCP directory API, please join our Discord server