Merge pull request #97 from hynky1999/cli-update

cli-update

hynky1999 authored Dec 6, 2023
2 parents 3b8c2d1 + 58688aa commit eb4dc39
Showing 5 changed files with 38 additions and 23 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -37,7 +37,7 @@ To create them you need example HTML files that you want to extract.
You can use the following command to get HTML files from the Common Crawl dataset:

```bash
$ cmon download --match_type=domain --limit=100 html_output html example.com
$ cmon download --match_type=domain --limit=100 html_output example.com html
```
This will download the first 100 HTML files from example.com and save them in html_output.
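If you want to see roughly what is being queried under the hood, the public Common Crawl CDX index can be hit directly. The crawl id below (CC-MAIN-2023-50) is only an assumed example; cmon itself decides which crawl indexes to query:

```bash
# Illustrative raw index lookup for a domain match, limited to 5 results
$ curl "https://index.commoncrawl.org/CC-MAIN-2023-50-index?url=example.com&matchType=domain&output=json&limit=5"
```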

@@ -106,7 +106,7 @@ In our case the config would look like this:
To test the extraction, you can use the following command:

```bash
$ cmon extract config.json extracted_output html html_output/*.html
$ cmon extract config.json extracted_output html_output/*.html html
```

### Crawl the sites
@@ -117,16 +117,16 @@ To do this you will proceed in two steps:
To do this, you can use the following command:

```bash
cmon download --match_type=domain --limit=100000 dr_output record example.com
cmon download --match_type=domain --limit=100 dr_output example.com record
```

This will download the first 100000 records from example.com and save them in dr_output. By default it saves 100,000 records per file; you can change this with the --max_crawls_per_file option.
This will download the first 100 records from example.com and save them in dr_output. By default it saves 100,000 records per file; you can change this with the --max_crawls_per_file option.
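For example, to cap the output at 10,000 records per file, a call along these lines should work (a sketch only: it assumes --max_crawls_per_file is accepted after the record sub-command, like the other record-specific options):

```bash
$ cmon download --match_type=domain --limit=100 dr_output example.com record --max_crawls_per_file=10000
```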

#### 2. Extract the records
Once you have the records, you can use the following command to extract them:

```bash
$ cmon extract --n_proc=4 config.json extracted_output record dr_output/*.jsonl
$ cmon extract --n_proc=4 config.json extracted_output dr_output/*.jsonl record
```

Note that you can use the --n_proc option to specify the number of processes to use for the extraction. Multiprocessing is done at the file level, so if you have just one file it will not be used.
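As a rough illustration, the extra processes only pay off when several input files are passed (the single file name below is made up):

```bash
# Several input files: up to 8 of them are processed in parallel
$ cmon extract --n_proc=8 config.json extracted_output dr_output/*.jsonl record

# A single (hypothetical) input file: the extra processes go unused
$ cmon extract --n_proc=8 config.json extracted_output dr_output/single_file.jsonl record
```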
17 changes: 11 additions & 6 deletions cmoncrawl/integrations/download.py
@@ -81,13 +81,9 @@ def add_mode_args(subparser: Any):
def add_args(subparser: Any):
    parser = subparser.add_parser("download", help="Download data from Common Crawl")
    parser.add_argument("output", type=Path, help="Path to output directory")
    mode_subparser = parser.add_subparsers(
        dest="mode", required=True, help="Download mode"
    )
    parser.add_argument(
        "urls", type=str, nargs="+", help="URLs to download, e.g. www.bcc.cz."
    )
    mode_subparser = add_mode_args(mode_subparser)
    parser.add_argument(
        "--limit", type=int, default=5, help="Max number of urls to download"
    )
@@ -155,6 +151,10 @@ def add_args(subparser: Any):
        default=None,
        help="S3 bucket to use for Athena. If set, the query results will be stored in the bucket and reused for later queries. Make sure to delete the bucket afterwards.",
    )
    mode_subparser = parser.add_subparsers(
        dest="mode", required=True, help="Download mode"
    )
    mode_subparser = add_mode_args(mode_subparser)
    parser.set_defaults(func=run_download)
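Concretely, registering the mode sub-parser after every other argument is what moves the mode sub-command to the end of the command line. The README examples updated in this same PR show the old and new forms:

```bash
# Old order (mode right after the output directory)
$ cmon download --match_type=domain --limit=100 html_output html example.com

# New order (mode sub-command last)
$ cmon download --match_type=domain --limit=100 html_output example.com html
```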


@@ -316,10 +316,15 @@ async def url_download(

def run_download(args: argparse.Namespace):
    mode = DownloadOutputFormat(args.mode)
    encoding = args.encoding if mode == DownloadOutputFormat.HTML else None
    # Record exclusives
    max_crawls_per_file = (
        args.max_crawls_per_file if mode == DownloadOutputFormat.RECORD else 1
    )
    # HTML exclusives
    encoding = args.encoding if mode == DownloadOutputFormat.HTML else None
    download_method = (
        DAOname(args.download_method) if mode == DownloadOutputFormat.HTML else None
    )
    return asyncio.run(
        url_download(
            urls=args.urls,
@@ -337,7 +342,7 @@ def run_download(args: argparse.Namespace):
            aggregator_type=args.aggregator,
            max_directory_size=args.max_directory_size,
            filter_non_200=args.filter_non_200,
            download_method=args.download_method,
            download_method=download_method,
            s3_bucket=args.s3_bucket,
        )
    )
26 changes: 18 additions & 8 deletions cmoncrawl/integrations/extract.py
@@ -102,14 +102,14 @@ def add_args(subparser: Any):
        default=1,
        help="Number of processes to use for extraction. The parallelization is at the file level, so for a single file it is useless to use more than one process.",
    )
    parser.add_argument(
        "files", nargs="+", type=Path, help="Files to extract data from"
    )

    mode_subparser = parser.add_subparsers(
        dest="mode", required=True, help="Extraction mode"
    )
    mode_subparser = add_mode_args(mode_subparser)
    parser.add_argument(
        "files", nargs="+", type=Path, help="Files to extract data from"
    )
    parser.set_defaults(func=run_extract)


@@ -216,24 +216,34 @@ def _extract_task(
    args: argparse.Namespace,
):
    mode = ExtractMode(args.mode)
    download_method = DAOname(args.download_method) if args.download_method else None

    # We have to set up loggers / aws in each process
    setup_loggers(args.verbosity)
    CONFIG.update_from_cli(args)

    # HTML exclusives
    url = args.url if mode == ExtractMode.HTML else None
    date = args.date if mode == ExtractMode.HTML else None

    # Record exclusives
    max_retry = args.max_retry if mode == ExtractMode.RECORD else 0
    sleep_base = args.sleep_base if mode == ExtractMode.RECORD else 0
    download_method = (
        DAOname(args.download_method) if mode == ExtractMode.RECORD else None
    )

    asyncio.run(
        extract_from_files(
            output_path=output_path,
            config=config,
            files=files,
            mode=mode,
            url=args.url if mode == ExtractMode.HTML else None,
            date=args.date if mode == ExtractMode.HTML else None,
            url=url,
            date=date,
            max_directory_size=args.max_directory_size,
            max_crawls_per_file=args.max_crawls_per_file,
            max_retry=args.max_retry if mode == ExtractMode.RECORD else 0,
            sleep_base=args.sleep_base if mode == ExtractMode.RECORD else 0,
            max_retry=max_retry,
            sleep_base=sleep_base,
            download_method=download_method,
        )
    )
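In CLI terms, this split means the record-only options (--max_retry, --sleep_base, --download_method) and the html-only options (--url, --date) are only honoured under their respective sub-commands. The updated examples from docs/source/cli/extract.rst in this PR show both cases:

```bash
# record mode: retry / sleep / download-method options apply
$ cmon extract config.json extracted_output dr_output/*.jsonl record --max_retry 100 --download_method=gateway --sleep_base 1.3

# html mode: --url and --date apply instead
$ cmon extract config.json extracted_output html_output/*.html html --date 2021-01-01 --url https://www.example.com
```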
4 changes: 2 additions & 2 deletions docs/source/cli/cli.rst
@@ -31,10 +31,10 @@ Examples
cmon download --match_type=domain --limit=100 html_output html example.com
# Take the domain records downloaded using the first command and extract them using your extractors
cmon extract config.json extracted_output record dr_output/*.jsonl
cmon extract config.json extracted_output dr_output/*.jsonl record
# Take the HTML files downloaded using the second command and extract them using your extractors
cmon extract config.json extracted_output html html_output/*.html
cmon extract config.json extracted_output html_output/*.html html
4 changes: 2 additions & 2 deletions docs/source/cli/extract.rst
@@ -77,10 +77,10 @@ Examples
.. code-block:: bash
# Take the domain records downloaded using the first command and extract them using your extractors
cmon extract config.json extracted_output record --max_retry 100 --download_method=gateway --sleep_base 1.3 dr_output/*.jsonl
cmon extract config.json extracted_output dr_output/*.jsonl record --max_retry 100 --download_method=gateway --sleep_base 1.3
# Take the HTML files downloaded using the second command and extract them using your extractors
cmon extract config.json extracted_output html --date 2021-01-01 --url https://www.example.com html_output/*.html
cmon extract config.json extracted_output html_output/*.html html --date 2021-01-01 --url https://www.example.com
When you build the extractors, you will appreciate being able to specify
the URL of the HTML file and the date of the extraction. This is because
