Skip to main content
Glama

Storyden

by Southclaws
Mozilla Public License 2.0
229
fetcher.go (5 kB)
// package fetcher simply grabs URLs, scrapes them and stores them and assets. package fetcher import ( "context" "log/slog" "net/http" "net/url" "github.com/Southclaws/fault" "github.com/Southclaws/fault/fctx" "github.com/Southclaws/fault/ftag" "github.com/Southclaws/opt" "github.com/gosimple/slug" "github.com/Southclaws/storyden/app/resources/asset" "github.com/Southclaws/storyden/app/resources/datagraph" "github.com/Southclaws/storyden/app/resources/link/link_querier" "github.com/Southclaws/storyden/app/resources/link/link_ref" "github.com/Southclaws/storyden/app/resources/link/link_writer" "github.com/Southclaws/storyden/app/resources/message" "github.com/Southclaws/storyden/app/services/asset/asset_upload" "github.com/Southclaws/storyden/app/services/link/scrape" "github.com/Southclaws/storyden/internal/infrastructure/pubsub" ) var errEmptyLink = fault.New("empty link") type Fetcher struct { logger *slog.Logger uploader *asset_upload.Uploader lq *link_querier.LinkQuerier lr *link_writer.LinkWriter sc scrape.Scraper bus *pubsub.Bus } func New( logger *slog.Logger, uploader *asset_upload.Uploader, lq *link_querier.LinkQuerier, lr *link_writer.LinkWriter, sc scrape.Scraper, bus *pubsub.Bus, ) *Fetcher { return &Fetcher{ logger: logger, uploader: uploader, lq: lq, lr: lr, sc: sc, bus: bus, } } type Options struct { ContentFill opt.Optional[asset.ContentFillCommand] } func (s *Fetcher) Fetch(ctx context.Context, u url.URL, opts Options) (*link_ref.LinkRef, error) { if u.String() == "" { return nil, fault.Wrap(errEmptyLink, fctx.With(ctx), ftag.With(ftag.InvalidArgument)) } r, err := s.lq.Search(ctx, 0, 1, link_querier.WithURL(u.String())) if err != nil { return nil, fault.Wrap(err, fctx.With(ctx)) } if len(r.Links) > 0 { if err := s.bus.SendCommand(ctx, &message.CommandScrapeLink{URL: u}); err != nil { return nil, fault.Wrap(err, fctx.With(ctx)) } return r.Links[0], nil } lr, _, err := s.ScrapeAndStore(ctx, u) if err != nil { return nil, fault.Wrap(err, 
fctx.With(ctx)) } return lr, nil } // HydrateContentURLs takes all the URLs mentioned in the content of an item and // queues them for hydration. This process visits each URL and fetches metadata. // Then, stores that metadata in the database and relates them back to the item. func (s *Fetcher) HydrateContentURLs(ctx context.Context, item datagraph.Item) { urls := item.GetContent().Links() for _, l := range urls { parsed, err := url.Parse(l) if err != nil { continue } err = s.queueForItem(ctx, *parsed, item) if err != nil { continue } } } // queueForItem queues a scrape request for a URL that is linked to an item. // When the scrape job is done, the scraped link will be related to the item. func (s *Fetcher) queueForItem(ctx context.Context, u url.URL, item datagraph.Item) error { if err := s.bus.SendCommand(ctx, &message.CommandScrapeLink{ URL: u, Item: datagraph.NewRef(item), }); err != nil { return fault.Wrap(err, fctx.With(ctx)) } return nil } func (s *Fetcher) ScrapeAndStore(ctx context.Context, u url.URL) (*link_ref.LinkRef, *scrape.WebContent, error) { wc, err := s.sc.Scrape(ctx, u) if err != nil { s.logger.Warn("failed to scrape URL, storing link with basic information only", slog.String("error", err.Error()), slog.String("url", u.String())) ln, err := s.lr.Store(ctx, u.String(), "", "", []link_writer.Option{}...) 
if err != nil { return nil, nil, fault.Wrap(err, fctx.With(ctx)) } return ln, &scrape.WebContent{}, nil } opts := []link_writer.Option{} if wc.Favicon != "" { a, err := s.CopyAsset(ctx, wc.Favicon) if err != nil { s.logger.Warn("failed to scrape web content favicon image", slog.String("error", err.Error()), slog.String("url", u.String())) } else { opts = append(opts, link_writer.WithFaviconImage(a.ID)) } } if wc.Image != "" { a, err := s.CopyAsset(ctx, wc.Image) if err != nil { s.logger.Warn("failed to scrape web content primary image", slog.String("error", err.Error()), slog.String("url", u.String())) } else { opts = append(opts, link_writer.WithPrimaryImage(a.ID)) } } ln, err := s.lr.Store(ctx, u.String(), wc.Title, wc.Description, opts...) if err != nil { return nil, nil, fault.Wrap(err, fctx.With(ctx)) } return ln, wc, nil } func (s *Fetcher) CopyAsset(ctx context.Context, url string) (*asset.Asset, error) { resp, err := http.Get(url) if err != nil { return nil, fault.Wrap(err, fctx.With(ctx)) } if resp.StatusCode != http.StatusOK { ctx = fctx.WithMeta(ctx, "status", resp.Status) return nil, fault.Wrap(fault.New("failed to get"), fctx.With(ctx)) } // TODO: Better naming??? name := slug.Make(url) a, err := s.uploader.Upload(ctx, resp.Body, resp.ContentLength, asset.NewFilename(name), asset_upload.Options{}) if err != nil { return nil, fault.Wrap(err, fctx.With(ctx)) } return a, nil }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Southclaws/storyden'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.