Merge pull request #97 from hynky1999/cli-update

cli-update

hynky1999 authored Dec 6, 2023
2 parents 3b8c2d1 + 58688aa commit eb4dc39
Showing 5 changed files with 38 additions and 23 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -37,7 +37,7 @@ To create them you need example HTML files that you want to extract.
You can use the following command to get HTML files from the Common Crawl dataset:

```bash
$ cmon download --match_type=domain --limit=100 html_output html example.com
$ cmon download --match_type=domain --limit=100 html_output example.com html
```
This will download the first 100 HTML files from example.com and save them in html_output.
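If you want to see roughly what is being queried under the hood, the public Common Crawl CDX index can be hit directly. The crawl id below (CC-MAIN-2023-50) is only an assumed example; cmon itself decides which crawl indexes to query:

```bash
# Illustrative raw index lookup for a domain match, limited to 5 results
$ curl "https://index.commoncrawl.org/CC-MAIN-2023-50-index?url=example.com&matchType=domain&output=json&limit=5"
```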

@@ -106,7 +106,7 @@ In our case the config would look like this:
To test the extraction, you can use the following command:

```bash
$ cmon extract config.json extracted_output html html_output/*.html
$ cmon extract config.json extracted_output html_output/*.html html
```

### Crawl the sites
@@ -117,16 +117,16 @@ To do this you will proceed in two steps:
To do this, you can use the following command:

```bash
cmon download --match_type=domain --limit=100000 dr_output record example.com
cmon download --match_type=domain --limit=100 dr_output example.com record
```

This will download the first 100000 records from example.com and save them in dr_output. By default it saves 100,000 records per file; you can change this with the --max_crawls_per_file option.
This will download the first 100 records from example.com and save them in dr_output. By default it saves 100,000 records per file; you can change this with the --max_crawls_per_file option.
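For example, to cap the output at 10,000 records per file, a call along these lines should work (a sketch only: it assumes --max_crawls_per_file is accepted after the record sub-command, like the other record-specific options):

```bash
$ cmon download --match_type=domain --limit=100 dr_output example.com record --max_crawls_per_file=10000
```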

#### 2. Extract the records
Once you have the records, you can use the following command to extract them:

```bash
$ cmon extract --n_proc=4 config.json extracted_output record dr_output/*.jsonl
$ cmon extract --n_proc=4 config.json extracted_output dr_output/*.jsonl record
```

Note that you can use the --n_proc option to specify the number of processes to use for the extraction. Multiprocessing is done at the file level, so if you have just one file it will not be used.
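As a rough illustration, the extra processes only pay off when several input files are passed (the single file name below is made up):

```bash
# Several input files: up to 8 of them are processed in parallel
$ cmon extract --n_proc=8 config.json extracted_output dr_output/*.jsonl record

# A single (hypothetical) input file: the extra processes go unused
$ cmon extract --n_proc=8 config.json extracted_output dr_output/single_file.jsonl record
```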
17 changes: 11 additions & 6 deletions cmoncrawl/integrations/download.py
@@ -81,13 +81,9 @@ def add_mode_args(subparser: Any):
def add_args(subparser: Any):
    parser = subparser.add_parser("download", help="Download data from Common Crawl")
    parser.add_argument("output", type=Path, help="Path to output directory")
    mode_subparser = parser.add_subparsers(
        dest="mode", required=True, help="Download mode"
    )
    parser.add_argument(
        "urls", type=str, nargs="+", help="URLs to download, e.g. www.bcc.cz."
    )
    mode_subparser = add_mode_args(mode_subparser)
    parser.add_argument(
        "--limit", type=int, default=5, help="Max number of urls to download"
    )
@@ -155,6 +151,10 @@ def add_args(subparser: Any):
        default=None,
        help="S3 bucket to use for Athena. If set, the query results will be stored in the bucket and reused for later queries. Make sure to delete the bucket afterwards.",
    )
    mode_subparser = parser.add_subparsers(
        dest="mode", required=True, help="Download mode"
    )
    mode_subparser = add_mode_args(mode_subparser)
    parser.set_defaults(func=run_download)
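Concretely, registering the mode sub-parser after every other argument is what moves the mode sub-command to the end of the command line. The README examples updated in this same PR show the old and new forms:

```bash
# Old order (mode right after the output directory)
$ cmon download --match_type=domain --limit=100 html_output html example.com

# New order (mode sub-command last)
$ cmon download --match_type=domain --limit=100 html_output example.com html
```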


@@ -316,10 +316,15 @@ async def url_download(

def run_download(args: argparse.Namespace):
    mode = DownloadOutputFormat(args.mode)
    encoding = args.encoding if mode == DownloadOutputFormat.HTML else None
    # Record exclusives
    max_crawls_per_file = (
        args.max_crawls_per_file if mode == DownloadOutputFormat.RECORD else 1
    )
    # HTML exclusives
    encoding = args.encoding if mode == DownloadOutputFormat.HTML else None
    download_method = (
        DAOname(args.download_method) if mode == DownloadOutputFormat.HTML else None
    )
    return asyncio.run(
        url_download(
            urls=args.urls,
@@ -337,7 +342,7 @@ def run_download(args: argparse.Namespace):
            aggregator_type=args.aggregator,
            max_directory_size=args.max_directory_size,
            filter_non_200=args.filter_non_200,
            download_method=args.download_method,
            download_method=download_method,
            s3_bucket=args.s3_bucket,
        )
    )
26 changes: 18 additions & 8 deletions cmoncrawl/integrations/extract.py
@@ -102,14 +102,14 @@ def add_args(subparser: Any):
        default=1,
        help="Number of processes to use for extraction. The parallelization is at the file level, so for a single file it is useless to use more than one process.",
    )
    parser.add_argument(
        "files", nargs="+", type=Path, help="Files to extract data from"
    )

    mode_subparser = parser.add_subparsers(
        dest="mode", required=True, help="Extraction mode"
    )
    mode_subparser = add_mode_args(mode_subparser)
    parser.add_argument(
        "files", nargs="+", type=Path, help="Files to extract data from"
    )
    parser.set_defaults(func=run_extract)


@@ -216,24 +216,34 @@ def _extract_task(
    args: argparse.Namespace,
):
    mode = ExtractMode(args.mode)
    download_method = DAOname(args.download_method) if args.download_method else None

    # We have to set up loggers / aws in each process
    setup_loggers(args.verbosity)
    CONFIG.update_from_cli(args)

    # HTML exclusives
    url = args.url if mode == ExtractMode.HTML else None
    date = args.date if mode == ExtractMode.HTML else None

    # Record exclusives
    max_retry = args.max_retry if mode == ExtractMode.RECORD else 0
    sleep_base = args.sleep_base if mode == ExtractMode.RECORD else 0
    download_method = (
        DAOname(args.download_method) if mode == ExtractMode.RECORD else None
    )

    asyncio.run(
        extract_from_files(
            output_path=output_path,
            config=config,
            files=files,
            mode=mode,
            url=args.url if mode == ExtractMode.HTML else None,
            date=args.date if mode == ExtractMode.HTML else None,
            url=url,
            date=date,
            max_directory_size=args.max_directory_size,
            max_crawls_per_file=args.max_crawls_per_file,
            max_retry=args.max_retry if mode == ExtractMode.RECORD else 0,
            sleep_base=args.sleep_base if mode == ExtractMode.RECORD else 0,
            max_retry=max_retry,
            sleep_base=sleep_base,
            download_method=download_method,
        )
    )
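In CLI terms, this split means the record-only options (--max_retry, --sleep_base, --download_method) and the html-only options (--url, --date) are only honoured under their respective sub-commands. The updated examples from docs/source/cli/extract.rst in this PR show both cases:

```bash
# record mode: retry / sleep / download-method options apply
$ cmon extract config.json extracted_output dr_output/*.jsonl record --max_retry 100 --download_method=gateway --sleep_base 1.3

# html mode: --url and --date apply instead
$ cmon extract config.json extracted_output html_output/*.html html --date 2021-01-01 --url https://www.example.com
```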
4 changes: 2 additions & 2 deletions docs/source/cli/cli.rst
@@ -31,10 +31,10 @@ Examples
cmon download --match_type=domain --limit=100 html_output html example.com
# Take the domain records downloaded using the first command and extract them using your extractors
cmon extract config.json extracted_output record dr_output/*.jsonl
cmon extract config.json extracted_output dr_output/*.jsonl record
# Take the HTML files downloaded using the second command and extract them using your extractors
cmon extract config.json extracted_output html html_output/*.html
cmon extract config.json extracted_output html_output/*.html html
4 changes: 2 additions & 2 deletions docs/source/cli/extract.rst
@@ -77,10 +77,10 @@ Examples
.. code-block:: bash
# Take the domain records downloaded using the first command and extract them using your extractors
cmon extract config.json extracted_output record --max_retry 100 --download_method=gateway --sleep_base 1.3 dr_output/*.jsonl
cmon extract config.json extracted_output dr_output/*.jsonl record --max_retry 100 --download_method=gateway --sleep_base 1.3
# Take the HTML files downloaded using the second command and extract them using your extractors
cmon extract config.json extracted_output html --date 2021-01-01 --url https://www.example.com html_output/*.html
cmon extract config.json extracted_output html_output/*.html html --date 2021-01-01 --url https://www.example.com
When you build the extractors, you will appreciate being able to specify
the URL of the HTML file and the date of the extraction. This is because
