<!-- mcp_server_webcrawl.crawlers.archivebox.html • 34.1 kB -->
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>mcp_server_webcrawl.crawlers.archivebox package — mcp-server-webcrawl documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" />
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="mcp_server_webcrawl.crawlers.httrack package" href="mcp_server_webcrawl.crawlers.httrack.html" />
<link rel="prev" title="mcp_server_webcrawl.crawlers.base package" href="mcp_server_webcrawl.crawlers.base.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
mcp-server-webcrawl
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="guides.html">Setup Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="usage.html">Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="prompts.html">Prompt Routines</a></li>
<li class="toctree-l1"><a class="reference internal" href="interactive.html">Interactive Mode</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">mcp_server_webcrawl</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">mcp-server-webcrawl</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="modules.html">mcp_server_webcrawl</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.html">mcp_server_webcrawl package</a></li>
<li class="breadcrumb-item"><a href="mcp_server_webcrawl.crawlers.html">mcp_server_webcrawl.crawlers package</a></li>
<li class="breadcrumb-item active">mcp_server_webcrawl.crawlers.archivebox package</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/mcp_server_webcrawl.crawlers.archivebox.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="mcp-server-webcrawl-crawlers-archivebox-package">
<h1>mcp_server_webcrawl.crawlers.archivebox package<a class="headerlink" href="#mcp-server-webcrawl-crawlers-archivebox-package" title="Link to this heading"></a></h1>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
</section>
<section id="module-mcp_server_webcrawl.crawlers.archivebox.adapter">
<span id="mcp-server-webcrawl-crawlers-archivebox-adapter-module"></span><h2>mcp_server_webcrawl.crawlers.archivebox.adapter module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.archivebox.adapter" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.adapter.ArchiveBoxManager">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">ArchiveBoxManager</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/adapter.html#ArchiveBoxManager"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.adapter.ArchiveBoxManager" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.indexed.IndexedManager" title="mcp_server_webcrawl.crawlers.base.indexed.IndexedManager"><code class="xref py py-class docutils literal notranslate"><span class="pre">IndexedManager</span></code></a></p>
<p>Manages ArchiveBox in-memory SQLite databases for session-level reuse.</p>
<p>Initialize the ArchiveBox manager with empty cache and statistics.</p>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.adapter.ArchiveBoxManager.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/adapter.html#ArchiveBoxManager.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.adapter.ArchiveBoxManager.__init__" title="Link to this definition"></a></dt>
<dd><p>Initialize the ArchiveBox manager with empty cache and statistics.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>None</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.adapter.get_sites">
<span class="sig-name descname"><span class="pre">get_sites</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/adapter.html#get_sites"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.adapter.get_sites" title="Link to this definition"></a></dt>
<dd><p>List ArchiveBox instances as separate sites.
Each subdirectory of datasrc that contains an “archive” folder is treated as a separate ArchiveBox instance.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.13)"><em>Path</em></a>) – path to the directory containing ArchiveBox instance directories</p></li>
<li><p><strong>ids</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site IDs to filter by</p></li>
<li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – optional list of fields to include in the response</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>List of SiteResult objects, one for each ArchiveBox instance</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.sites.SiteResult" title="mcp_server_webcrawl.models.sites.SiteResult"><em>SiteResult</em></a>]</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.adapter.get_resources">
<span class="sig-name descname"><span class="pre">get_resources</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sites</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sort</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">limit</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/adapter.html#get_resources"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.adapter.get_resources" title="Link to this definition"></a></dt>
<dd><p>Get resources from ArchiveBox instances using in-memory SQLite.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.13)"><em>Path</em></a>) – path to the directory containing ArchiveBox instance directories</p></li>
<li><p><strong>sites</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>] </em><em>| </em><em>None</em>) – optional list of site IDs to filter by</p></li>
<li><p><strong>query</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – search query string</p></li>
<li><p><strong>fields</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>] </em><em>| </em><em>None</em>) – optional list of fields to include in response</p></li>
<li><p><strong>sort</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> | </em><em>None</em>) – sort order for results</p></li>
<li><p><strong>limit</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – maximum number of results to return</p></li>
<li><p><strong>offset</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – number of results to skip for pagination</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>Tuple of (list of ResourceResult objects, total count, IndexState)</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)">tuple</a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)">list</a>[<a class="reference internal" href="mcp_server_webcrawl.models.html#mcp_server_webcrawl.models.resources.ResourceResult" title="mcp_server_webcrawl.models.resources.ResourceResult"><em>ResourceResult</em></a>], <a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)">int</a>, <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.adapter.IndexState" title="mcp_server_webcrawl.crawlers.base.adapter.IndexState"><em>IndexState</em></a>]</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.archivebox.crawler">
<span id="mcp-server-webcrawl-crawlers-archivebox-crawler-module"></span><h2>mcp_server_webcrawl.crawlers.archivebox.crawler module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.archivebox.crawler" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">ArchiveBoxCrawler</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/crawler.html#ArchiveBoxCrawler"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler" title="mcp_server_webcrawl.crawlers.base.indexed.IndexedCrawler"><code class="xref py py-class docutils literal notranslate"><span class="pre">IndexedCrawler</span></code></a></p>
<p>A crawler implementation for ArchiveBox archived sites.
Provides functionality for accessing and searching web content from ArchiveBox archives.
ArchiveBox creates single-URL archives with metadata stored in JSON files
and HTML content preserved in index.html files.</p>
<p>Initialize the ArchiveBox crawler with a data source directory.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>datasrc</strong> – The input argument as a Path; it must be a directory containing
ArchiveBox archive directories, each containing individual URL entries</p>
</dd>
<dt class="field-even">Raises<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#AssertionError" title="(in Python v3.13)"><strong>AssertionError</strong></a> – If datasrc is None or not a directory</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">datasrc</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/crawler.html#ArchiveBoxCrawler.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.crawler.ArchiveBoxCrawler.__init__" title="Link to this definition"></a></dt>
<dd><p>Initialize the ArchiveBox crawler with a data source directory.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>datasrc</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.13)"><em>Path</em></a>) – The input argument as a Path; it must be a directory containing
ArchiveBox archive directories, each containing individual URL entries</p>
</dd>
<dt class="field-even">Raises<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#AssertionError" title="(in Python v3.13)"><strong>AssertionError</strong></a> – If datasrc is None or not a directory</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.archivebox.tests">
<span id="mcp-server-webcrawl-crawlers-archivebox-tests-module"></span><h2>mcp_server_webcrawl.crawlers.archivebox.tests module<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.archivebox.tests" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">ArchiveBoxTests</span></span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests" title="Link to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="mcp_server_webcrawl.crawlers.base.html#mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests" title="mcp_server_webcrawl.crawlers.base.tests.BaseCrawlerTests"><code class="xref py py-class docutils literal notranslate"><span class="pre">BaseCrawlerTests</span></code></a></p>
<p>Test suite for the ArchiveBox crawler implementation.
Uses wrapped test methods from BaseCrawlerTests adapted for ArchiveBox’s multi-instance structure.</p>
<p>Create an instance of the class that will use the named test
method when executed. Raises a ValueError if the instance does
not have a method with the specified name.</p>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.setUp">
<span class="sig-name descname"><span class="pre">setUp</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.setUp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.setUp" title="Link to this definition"></a></dt>
<dd><p>Set up the test environment with fixture data.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_pulse">
<span class="sig-name descname"><span class="pre">test_archivebox_pulse</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_pulse"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_pulse" title="Link to this definition"></a></dt>
<dd><p>Test basic crawler initialization.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_sites">
<span class="sig-name descname"><span class="pre">test_archivebox_sites</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_sites"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_sites" title="Link to this definition"></a></dt>
<dd><p>Test site retrieval API functionality.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_search">
<span class="sig-name descname"><span class="pre">test_archivebox_search</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_search"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_search" title="Link to this definition"></a></dt>
<dd><p>Test boolean search functionality.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_pragmar_tokenizer">
<span class="sig-name descname"><span class="pre">test_pragmar_tokenizer</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_pragmar_tokenizer"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_pragmar_tokenizer" title="Link to this definition"></a></dt>
<dd><p>Test tokenizer search functionality.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_resources">
<span class="sig-name descname"><span class="pre">test_archivebox_resources</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_resources"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_resources" title="Link to this definition"></a></dt>
<dd><p>Test resource retrieval API functionality with various parameters.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_interrobot_images">
<span class="sig-name descname"><span class="pre">test_interrobot_images</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_interrobot_images"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_interrobot_images" title="Link to this definition"></a></dt>
<dd><p>Test InterroBot-specific image handling and thumbnails.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_sorts">
<span class="sig-name descname"><span class="pre">test_archivebox_sorts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_sorts"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_sorts" title="Link to this definition"></a></dt>
<dd><p>Test random sort functionality using the ‘?’ sort parameter.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_content_parsing">
<span class="sig-name descname"><span class="pre">test_archivebox_content_parsing</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_content_parsing"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_content_parsing" title="Link to this definition"></a></dt>
<dd><p>Test content type detection and parsing for ArchiveBox resources.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_url_reconstruction">
<span class="sig-name descname"><span class="pre">test_archivebox_url_reconstruction</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_url_reconstruction"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_url_reconstruction" title="Link to this definition"></a></dt>
<dd><p>Test URL reconstruction from ArchiveBox metadata.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_deduplication">
<span class="sig-name descname"><span class="pre">test_archivebox_deduplication</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_deduplication"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_deduplication" title="Link to this definition"></a></dt>
<dd><p>Test resource deduplication across timestamped entries.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_metadata_parsing">
<span class="sig-name descname"><span class="pre">test_archivebox_metadata_parsing</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_metadata_parsing"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_metadata_parsing" title="Link to this definition"></a></dt>
<dd><p>Test JSON metadata parsing from ArchiveBox files.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_timestamped_structure">
<span class="sig-name descname"><span class="pre">test_archivebox_timestamped_structure</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_timestamped_structure"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_timestamped_structure" title="Link to this definition"></a></dt>
<dd><p>Test handling of ArchiveBox’s timestamped entry structure.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_error_resilience">
<span class="sig-name descname"><span class="pre">test_archivebox_error_resilience</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_error_resilience"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_error_resilience" title="Link to this definition"></a></dt>
<dd><p>Test resilience to malformed JSON and missing files.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_multi_site">
<span class="sig-name descname"><span class="pre">test_archivebox_multi_site</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_archivebox_multi_site"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_archivebox_multi_site" title="Link to this definition"></a></dt>
<dd><p>Test that multiple ArchiveBox working directories are treated as separate sites.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_report">
<span class="sig-name descname"><span class="pre">test_report</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/mcp_server_webcrawl/crawlers/archivebox/tests.html#ArchiveBoxTests.test_report"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mcp_server_webcrawl.crawlers.archivebox.tests.ArchiveBoxTests.test_report" title="Link to this definition"></a></dt>
<dd><p>Run test report for ArchiveBox archive.</p>
</dd></dl>
</dd></dl>
</section>
<section id="module-mcp_server_webcrawl.crawlers.archivebox">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-mcp_server_webcrawl.crawlers.archivebox" title="Link to this heading"></a></h2>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="mcp_server_webcrawl.crawlers.base.html" class="btn btn-neutral float-left" title="mcp_server_webcrawl.crawlers.base package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="mcp_server_webcrawl.crawlers.httrack.html" class="btn btn-neutral float-right" title="mcp_server_webcrawl.crawlers.httrack package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>© Copyright 2025, pragmar.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>