Skip to content

Commit

Permalink
update spacy to resolve a conflict with ms-swift (#397)
Browse files Browse the repository at this point in the history
* update_spacy

* fix model version

* keep model 3.5.0

* update spacy to 3.7.0 & support native tar.gz package

* update docker version

* update librosa version

* update nltk version

---------

Co-authored-by: gece.gc <[email protected]>
  • Loading branch information
BeachWang and drcege authored Aug 27, 2024
1 parent 0a0e78e commit 2689413
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 13 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version: '3'
services:
ray-head:
image: data-juicer-unittest:0.2.1
image: data-juicer-unittest:0.2.2
pull_policy: never
command: ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block
environment:
Expand Down Expand Up @@ -30,7 +30,7 @@ services:
capabilities: [gpu]

ray-worker:
image: data-juicer-unittest:0.2.1
image: data-juicer-unittest:0.2.2
pull_policy: never
command: ray start --address=ray-head:6379 --block
environment:
Expand Down
37 changes: 30 additions & 7 deletions data_juicer/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def check_model(model_name, force=False):
)
else:
logger.info(
f'Model [{cached_model_path}] not found . Downloading...')
f'Model [{cached_model_path}] not found. Downloading...')

try:
model_link = os.path.join(MODEL_LINKS, model_name)
Expand Down Expand Up @@ -406,7 +406,7 @@ def prepare_huggingface_model(pretrained_model_name_or_path,
return (model, processor) if return_model else processor


def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.5.0'):
def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.7.0'):
"""
Prepare spacy model for specific language.
Expand All @@ -419,17 +419,40 @@ def prepare_spacy_model(lang, name_pattern='{}_core_web_md-3.5.0'):
assert lang in ['zh', 'en'], 'Diversity only support zh and en'
model_name = name_pattern.format(lang)
logger.info(f'Loading spacy model [{model_name}]...')
compressed_model = '{}.zip'.format(model_name)
compressed_model = '{}.tar.gz'.format(model_name)

# decompress the compressed model if it's not decompressed
def decompress_model(compressed_model_path):
    """
    Unpack the model directory from a spaCy ``.tar.gz`` package.

    Only the files stored under ``<name>-<ver>/<name>/<name>-<ver>/`` inside
    the archive are extracted; they are written directly below a directory
    that sits next to the archive and is named after it without the
    ``.tar.gz`` suffix. If that directory already exists, nothing is
    extracted.

    :param compressed_model_path: path to the ``.tar.gz`` model package.
    :return: path of the directory holding the decompressed model.
    :raises ValueError: if the path does not end with ``.tar.gz`` or an
        archive member would be written outside the target directory.
    """
    import shutil
    import tarfile

    suffix = '.tar.gz'
    if not compressed_model_path.endswith(suffix):
        raise ValueError('Only .tar.gz files are supported')

    # Strip only the trailing suffix: str.replace would also remove any
    # '.tar.gz' that happens to occur in a parent directory name.
    decompressed_model_path = compressed_model_path[:-len(suffix)]
    if os.path.isdir(decompressed_model_path):
        return decompressed_model_path

    ver_name = os.path.basename(decompressed_model_path)
    unver_name = ver_name.rsplit('-', maxsplit=1)[0]
    target_dir_in_archive = f'{ver_name}/{unver_name}/{ver_name}/'

    dest_root = os.path.abspath(decompressed_model_path)
    with tarfile.open(compressed_model_path, 'r:gz') as tar:
        for member in tar.getmembers():
            if not member.isfile():
                continue
            if not member.name.startswith(target_dir_in_archive):
                continue

            # path relative to the model directory, i.e. without the
            # packaging directory levels
            relative_path = member.name[len(target_dir_in_archive):]
            target_path = os.path.abspath(
                os.path.join(dest_root, relative_path))

            # refuse members whose name (e.g. via '..') resolves outside
            # the model directory
            if target_path == dest_root or os.path.commonpath(
                    [dest_root, target_path]) != dest_root:
                raise ValueError(
                    f'Unsafe path in model archive: {member.name}')

            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            # stream the content instead of loading the whole file in memory
            with tar.extractfile(member) as source, \
                    open(target_path, 'wb') as target:
                shutil.copyfileobj(source, target)
    return decompressed_model_path

try:
Expand Down
4 changes: 2 additions & 2 deletions environments/minimal_requires.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ pandas==2.0.3
datasets==2.18.0
av
soundfile
librosa
librosa>=0.10
loguru
tabulate
tqdm
Expand All @@ -21,7 +21,7 @@ pdfplumber
plotly
python-docx
streamlit
spacy==3.5.0
spacy==3.7.0
multiprocess==0.70.12
dill==0.3.4
psutil
4 changes: 2 additions & 2 deletions environments/science_requires.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ simhash-pybind
selectolax
nlpaug
nlpcda
nltk
nltk<3.9
transformers>=4.37
transformers_stream_generator
einops
accelerate
tiktoken
opencc==1.1.6
imagededup
spacy-pkuseg==0.0.32
spacy-pkuseg
diffusers
simple-aesthetics-predictor
scenedetect[opencv]
Expand Down

0 comments on commit 2689413

Please sign in to comment.