.env.sample
.gitignore
.pre-commit-config.yaml
LICENSE
Makefile
README.md
pyproject.toml
pyrightconfig.json
requirements.dev.txt
requirements.test.txt
requirements.txt
.github/workflows/release.yml
.github/workflows/sphinx_build.yml
.github/workflows/test_and_types.yml
.vscode/launch.json
.vscode/settings.json
CmonCrawl.egg-info/PKG-INFO
CmonCrawl.egg-info/SOURCES.txt
CmonCrawl.egg-info/dependency_links.txt
CmonCrawl.egg-info/entry_points.txt
CmonCrawl.egg-info/requires.txt
CmonCrawl.egg-info/top_level.txt
cmoncrawl/__init__.py
cmoncrawl/config.py
cmoncrawl/aggregator/__init__.py
cmoncrawl/aggregator/athena_query.py
cmoncrawl/aggregator/base.py
cmoncrawl/aggregator/gateway_query.py
cmoncrawl/aggregator/.vscode/settings.json
cmoncrawl/aggregator/utils/__init__.py
cmoncrawl/aggregator/utils/athena_query_maker.py
cmoncrawl/aggregator/utils/constants.py
cmoncrawl/aggregator/utils/helpers.py
cmoncrawl/aggregator/utils/ndjson.py
cmoncrawl/common/__init__.py
cmoncrawl/common/loggers.py
cmoncrawl/common/throttling.py
cmoncrawl/common/types.py
cmoncrawl/integrations/__init__.py
cmoncrawl/integrations/commands.py
cmoncrawl/integrations/download.py
cmoncrawl/integrations/extract.py
cmoncrawl/integrations/utils.py
cmoncrawl/middleware/__init__.py
cmoncrawl/middleware/stompware.py
cmoncrawl/middleware/synchronized.py
cmoncrawl/processor/__init__.py
cmoncrawl/processor/dao/__init__.py
cmoncrawl/processor/dao/api.py
cmoncrawl/processor/dao/base.py
cmoncrawl/processor/dao/s3.py
cmoncrawl/processor/extraction/__init__.py
cmoncrawl/processor/extraction/filters.py
cmoncrawl/processor/extraction/utils.py
cmoncrawl/processor/pipeline/__init__.py
cmoncrawl/processor/pipeline/downloader.py
cmoncrawl/processor/pipeline/extractor.py
cmoncrawl/processor/pipeline/pipeline.py
cmoncrawl/processor/pipeline/router.py
cmoncrawl/processor/pipeline/streamer.py
docs/Makefile
docs/make.bat
docs/source/api.rst
docs/source/conf.py
docs/source/index.rst
docs/source/usage.rst
docs/source/cli/cli.rst
docs/source/cli/download.rst
docs/source/cli/extract.rst
docs/source/cli/index.rst
docs/source/extraction/config_file.rst
docs/source/extraction/creating_extractor.rst
docs/source/extraction/index.rst
docs/source/extraction/utils.rst
docs/source/generated/cmoncrawl.aggregator.athena_query.rst
docs/source/generated/cmoncrawl.aggregator.base.rst
docs/source/generated/cmoncrawl.aggregator.gateway_query.rst
docs/source/generated/cmoncrawl.aggregator.rst
docs/source/generated/cmoncrawl.aggregator.utils.athena_query_maker.rst
docs/source/generated/cmoncrawl.aggregator.utils.helpers.rst
docs/source/generated/cmoncrawl.aggregator.utils.ndjson.rst
docs/source/generated/cmoncrawl.aggregator.utils.rst
docs/source/generated/cmoncrawl.common.loggers.rst
docs/source/generated/cmoncrawl.common.rst
docs/source/generated/cmoncrawl.common.throttling.rst
docs/source/generated/cmoncrawl.common.types.rst
docs/source/generated/cmoncrawl.config.rst
docs/source/generated/cmoncrawl.integrations.commands.rst
docs/source/generated/cmoncrawl.integrations.download.rst
docs/source/generated/cmoncrawl.integrations.extract.rst
docs/source/generated/cmoncrawl.integrations.rst
docs/source/generated/cmoncrawl.integrations.utils.rst
docs/source/generated/cmoncrawl.middleware.rst
docs/source/generated/cmoncrawl.middleware.stompware.rst
docs/source/generated/cmoncrawl.middleware.synchronized.rst
docs/source/generated/cmoncrawl.processor.dao.api.rst
docs/source/generated/cmoncrawl.processor.dao.base.rst
docs/source/generated/cmoncrawl.processor.dao.rst
docs/source/generated/cmoncrawl.processor.dao.s3.rst
docs/source/generated/cmoncrawl.processor.extraction.filters.rst
docs/source/generated/cmoncrawl.processor.extraction.rst
docs/source/generated/cmoncrawl.processor.extraction.utils.rst
docs/source/generated/cmoncrawl.processor.pipeline.downloader.rst
docs/source/generated/cmoncrawl.processor.pipeline.extractor.rst
docs/source/generated/cmoncrawl.processor.pipeline.pipeline.rst
docs/source/generated/cmoncrawl.processor.pipeline.router.rst
docs/source/generated/cmoncrawl.processor.pipeline.rst
docs/source/generated/cmoncrawl.processor.pipeline.streamer.rst
docs/source/generated/cmoncrawl.processor.rst
docs/source/generated/cmoncrawl.rst
docs/source/images/domain_record.drawio.pdf
docs/source/images/when_to_use.drawio.png
docs/source/misc/athena.rst
docs/source/misc/domain_record.rst
docs/source/misc/index.rst
docs/source/prog_guide/index.rst
docs/source/prog_guide/overview.rst
docs/source/prog_guide/practice.rst
examples/code-usage/offline-warc-iteration.py
examples/extractor_tutorial/config.json
examples/extractor_tutorial/Extractors/bbc_extractor.py
examples/extractor_tutorial/Extractors/idnes_extractor.py
extractors/my_extractor.py
tests/__init__.py
tests/athena_test.py
tests/end_to_end_test.py
tests/gateway_test.py
tests/helpers_test.py
tests/processor_test.py
tests/utils.py
tests/files/mini.warc.gz
tests/test_extract/cfg.json
tests/test_extract/extractors/test_extract.py
tests/test_extract/files/file.html
tests/test_extract/files/file.jsonl
tests/test_routes/a.py
tests/test_routes/b.py