togethercomputer · feifeibear · May 8, 2023
diff --git a/data_prep/cc/cc_net/Makefile b/data_prep/cc/cc_net/Makefile
@@ -1,6 +1,7 @@
 # Makefile to install CC-Net and train the LMs.
 # `make` or `make help` to get some help.
 
+
 # Arguments:
 lang?=en
 process?=8
@@ -58,6 +59,7 @@ dl_lm:
 lm: data/lm_sp/$(lang).sp.model data/lm_sp/$(lang).arpa.bin
 	# Computes a 5-gram LM for the given language -> make lang=it lm
 	# Restricted to the first NDOC_FOR_LM documents
+	mkdir -p data/lm_sp
 
 sp: data/lm_sp/$(lang).sp.model
 	# Train a sentence piece model on Wikipedia -> make lang=it sp
@@ -111,20 +113,21 @@ data/lm_sp/%.sp.model: data/cirrus/txt/%.opening.txt
 	echo "Trained SentencePiece model with `wc -l $(basename $@).vocab` pieces"
 
 data/cirrus/sp/%.opening.txt: data/cirrus/gz/%.json.gz data/lm_sp/%.sp.model
+	mkdir -p $(@D)
 	$(SPM_ENCODE) \
 		--model=$(word 2,$^) \
 		--output_format=piece \
-			< <(python get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
+			< <(python cc_net/get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
 			> $@
 
 data/cirrus/txt/%.opening.txt: data/cirrus/gz/%.json.gz
-	python get_wiki_cirrus.py opening \
+	python cc_net/get_wiki_cirrus.py opening \
 		--n_docs $(NDOC_FOR_LM) \
 		--file $< --output $@
 
 data/cirrus/gz/%.json.gz:
-	mkdir $(@D)
-	python get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
+	mkdir -p $(@D)
+	python cc_net/get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
 
 clean:
 	# Remove intemediary files, dataset, third_party sources
@@ -155,11 +158,8 @@ bin/lmplz: third_party/kenlm
 third_party/sentencepiece:
 	# Download sentencepiece sources: https://github.com/google/sentencepiece
 	mkdir -p $(@D)
-	wget -c -O $(@D)/sentencepiece.zip https://github.com/google/sentencepiece/archive/v0.1.83.zip
-	unzip -o -d $(@D) $(@D)/sentencepiece.zip
-	rm $(@D)/sentencepiece.zip
-	# remove the version id from the folder name
-	mv $(@D)/sentencepiece-* $@
+	# NOTE(review): clones the default branch, so the build is unpinned (the zip it replaces was v0.1.83)
+	git clone --depth 1 https://github.com/google/sentencepiece.git $@
 
 bin/spm_train: third_party/sentencepiece
 	# Compiles sentencepiece binaries
@@ -172,7 +172,10 @@ bin/spm_train: third_party/sentencepiece
 	# $ cd $</build
 	# $ sudo make install
 	# $ sudo ldconfig -v
-
+	# On macOS (uname -s prints Darwin), refresh the dyld shared
+	# cache with:
+	# $ sudo update_dyld_shared_cache
+
 test:
 	python -m cc_net mine --config test
 	mkdir -p test_data/mini

diff --git a/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py b/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
@@ -14,7 +14,7 @@
 import urllib.request
 from pathlib import Path
 from typing import Dict
-
+import os
 import func_argparse
 from bs4 import BeautifulSoup  # type: ignore
 
@@ -38,8 +38,8 @@ def opening(file: Path, output: Path = None, n_docs: int = 1_000_000):
         - tokenize: whether to tokenize the text
         - lang: Language code used to chose the tokenizer
     """
-    assert file.exists()
-    return jsonql.run_pipes(
+    assert file.exists(), f"{file} does not exist"
+    jsonql.run_pipes(
         functools.partial(extract_opening_text, n_docs=n_docs),
         file=file,
         output=tmp(output) if output else None,
@@ -116,8 +116,12 @@ def get_cirrus_urls(date: str = None) -> Dict[str, str]:
 
 
 def wget(url: str, output: Path):
-    subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True)
-    tmp(output).replace(output)
+    # Fetch via tmp(output) then rename; skip if `output` is already a file.
+    if os.path.isfile(output):
+        print(f"File {output} already exists, skipping download")
+    else:
+        subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True)
+        tmp(output).replace(output)
     assert (
         output.stat().st_size > 10_000
     ), f"File {output} downloaded from {url} looks too small"