
only download models from huggingface (#888)
lvyufeng authored Mar 4, 2024
1 parent 013c974 commit 40d6a27
Showing 42 changed files with 272 additions and 353 deletions.
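The change is mechanical across the call sites below: checkpoints are now always resolved from the Hugging Face Hub, so the `from_pt=True` argument is dropped everywhere, while `from_pretrained` still pops the kwarg and discards it, so old callers keep working. A minimal before/after sketch, reusing a model id that appears in this diff:

    from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM

    # Before this commit, callers opted in to Hub/PyTorch weights explicitly:
    #   tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/pangu-350M-sft", from_pt=True)

    # After this commit the flag is unnecessary, and silently ignored if passed:
    tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/pangu-350M-sft")
    model = AutoModelForCausalLM.from_pretrained("sunzeyeah/pangu-350M-sft")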
4 changes: 2 additions & 2 deletions examples/classification/gpt_imdb_finetune.ipynb
@@ -86,7 +86,7 @@
    "source": [
     "from mindnlp.transformers import GPTTokenizer\n",
     "# tokenizer\n",
-    "gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt', from_pt=True)\n",
+    "gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt')\n",
     "\n",
     "# add sepcial token: <PAD>\n",
     "special_tokens_dict = {\n",
@@ -137,7 +137,7 @@
     "from mindspore.experimental.optim import Adam\n",
     "\n",
     "# set bert config and define parameters for training\n",
-    "model = GPTForSequenceClassification.from_pretrained('openai-gpt', from_pt=True, num_labels=2)\n",
+    "model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)\n",
     "model.config.pad_token_id = gpt_tokenizer.pad_token_id\n",
     "model.resize_token_embeddings(model.config.vocab_size + 3)\n",
     "\n",
2 changes: 1 addition & 1 deletion llm/finetune/graphormer/graphormer_finetune.py
@@ -69,7 +69,7 @@ def main(args):
                               auto_load=True)
 
     # Load model
-    model = GraphormerForGraphClassification.from_pretrained("clefourrier/graphormer-base-pcqm4mv2", from_pt=True)
+    model = GraphormerForGraphClassification.from_pretrained("clefourrier/graphormer-base-pcqm4mv2")
 
     # Initiate the optimizer
     optimizer = nn.AdamWeightDecay(model.trainable_params(),
4 changes: 2 additions & 2 deletions llm/inference/chatglm2/cli_demo.py
@@ -2,8 +2,8 @@
 import platform
 from mindnlp.transformers import ChatGLM2Tokenizer, ChatGLM2ForConditionalGeneration
 
-tokenizer = ChatGLM2Tokenizer.from_pretrained("THUDM/chatglm2-6b", from_pt=True)
-model = ChatGLM2ForConditionalGeneration.from_pretrained("THUDM/chatglm2-6b", from_pt=True)
+tokenizer = ChatGLM2Tokenizer.from_pretrained("THUDM/chatglm2-6b")
+model = ChatGLM2ForConditionalGeneration.from_pretrained("THUDM/chatglm2-6b")
 model = model.set_train(False)
 
 os_name = platform.system()
4 changes: 2 additions & 2 deletions llm/inference/chatglm3/cli_demo.py
@@ -2,8 +2,8 @@
 import platform
 from mindnlp.transformers import ChatGLM3Tokenizer, ChatGLM3ForConditionalGeneration
 
-tokenizer = ChatGLM3Tokenizer.from_pretrained("THUDM/chatglm3-6b", from_pt=True)
-model = ChatGLM3ForConditionalGeneration.from_pretrained("THUDM/chatglm3-6b", from_pt=True)
+tokenizer = ChatGLM3Tokenizer.from_pretrained("THUDM/chatglm3-6b")
+model = ChatGLM3ForConditionalGeneration.from_pretrained("THUDM/chatglm3-6b")
 model = model.set_train(False)
 
 os_name = platform.system()
4 changes: 2 additions & 2 deletions llm/inference/pangu/pangu_generate.py
@@ -1,7 +1,7 @@
 from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/pangu-350M-sft", from_pt=True)
-model = AutoModelForCausalLM.from_pretrained("sunzeyeah/pangu-350M-sft", from_pt=True)
+tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/pangu-350M-sft")
+model = AutoModelForCausalLM.from_pretrained("sunzeyeah/pangu-350M-sft")
 
 prompt = "我不能确定对方是不是喜欢我,我却想分分秒秒跟他在一起,有谁能告诉我如何能想他少一点<sep>回答:"
 inputs = tokenizer(prompt, add_special_tokens=False, return_token_type_ids=False, return_tensors="ms")
2 changes: 0 additions & 2 deletions llm/inference/phi_2/streamlit_app.py
@@ -4,12 +4,10 @@
 # Load the Phi 2 model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained(
     "microsoft/phi-2",
-    from_pt=True
 )
 
 model = AutoModelForCausalLM.from_pretrained(
     "microsoft/phi-2",
-    from_pt=True
 )
 
 # Streamlit UI
77 changes: 20 additions & 57 deletions mindnlp/transformers/modeling_utils.py
@@ -800,7 +800,7 @@ def from_pretrained( # pylint: disable=too-many-locals
         """from_pretrained"""
         state_dict = kwargs.pop("state_dict", None)
         cache_dir = kwargs.pop("cache_dir", None)
-        from_pt = kwargs.pop("from_pt", True)
+        _ = kwargs.pop("from_pt", True)
         force_download = kwargs.pop("force_download", False)
         resume_download = kwargs.pop("resume_download", False)
         proxies = kwargs.pop("proxies", None)
@@ -839,7 +839,7 @@ def from_pretrained( # pylint: disable=too-many-locals
         pretrained_model_name_or_path = str(pretrained_model_name_or_path)
         is_local = os.path.isdir(pretrained_model_name_or_path)
         if is_local:
-            if from_pt and os.path.isfile(
+            if os.path.isfile(
                 os.path.join(pretrained_model_name_or_path, subfolder, PT_WEIGHTS_NAME)
             ):
                 # Load from a PyTorch checkpoint
@@ -858,7 +858,7 @@ def from_pretrained( # pylint: disable=too-many-locals
                 archive_file = os.path.join(
                     pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant)
                 )
-            elif from_pt and os.path.isfile(
+            elif os.path.isfile(
                 os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(PT_WEIGHTS_INDEX_NAME, variant))
             ):
                 # Load from a sharded PyTorch checkpoint
@@ -901,11 +901,12 @@ def from_pretrained( # pylint: disable=too-many-locals
         elif is_remote_url(pretrained_model_name_or_path):
             filename = pretrained_model_name_or_path
             resolved_archive_file = download_url(pretrained_model_name_or_path)
-        elif from_pt:
+        else:
             if use_safetensors is not False:
                 filename = _add_variant(SAFE_WEIGHTS_NAME, variant)
             else:
-                filename = _add_variant(PT_WEIGHTS_NAME, variant)
+                filename = _add_variant(WEIGHTS_NAME, variant)
+
             try:
                 # Load from URL or cache if already cached
                 cached_file_kwargs = {
@@ -935,68 +936,30 @@ def from_pretrained( # pylint: disable=too-many-locals
                     if resolved_archive_file is not None:
                         is_sharded = True
                         use_safetensors = True
-                    else:
-                        # This repo has no safetensors file of any kind, we switch to PyTorch.
-                        filename = _add_variant(PT_WEIGHTS_NAME, variant)
-                        resolved_archive_file = cached_file(
-                            pretrained_model_name_or_path, filename, **cached_file_kwargs
-                        )
 
                 if resolved_archive_file is None:
-                    filename = _add_variant(PT_WEIGHTS_NAME, variant)
+                    filename = _add_variant(WEIGHTS_NAME, variant)
                     resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
 
-                if resolved_archive_file is None and filename == _add_variant(PT_WEIGHTS_NAME, variant):
+                if resolved_archive_file is None and filename == _add_variant(WEIGHTS_NAME, variant):
                     # Maybe the checkpoint is sharded, we try to grab the index name in this case.
                     resolved_archive_file = cached_file(
                         pretrained_model_name_or_path,
-                        _add_variant(PT_WEIGHTS_INDEX_NAME, variant),
+                        _add_variant(WEIGHTS_INDEX_NAME, variant),
                         **cached_file_kwargs,
                     )
                     if resolved_archive_file is not None:
                         is_sharded = True
 
-                if resolved_archive_file is None:
-                    raise EnvironmentError(
-                        f"{pretrained_model_name_or_path} does not appear to have a file named"
-                        f" {_add_variant(SAFE_WEIGHTS_NAME, variant)}, {_add_variant(PT_WEIGHTS_NAME, variant)}"
-                    )
-            except EnvironmentError:
-                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted
-                # to the original exception.
-                raise
-            except Exception as exc:
-                # For any other exception, we throw a generic error.
-                raise EnvironmentError(
-                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
-                    ", make sure you don't have a local directory with the"
-                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
-                    f" directory containing a file named {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
-                    f" {_add_variant(PT_WEIGHTS_NAME, variant)}."
-                ) from exc
-        else:
-            # set correct filename
-            filename = _add_variant(WEIGHTS_NAME, variant)
-            try:
-                # Load from URL or cache if already cached
-                cached_file_kwargs = {
-                    "cache_dir": cache_dir,
-                    "force_download": force_download,
-                    "proxies": proxies,
-                    "resume_download": resume_download,
-                    "local_files_only": local_files_only,
-                    "subfolder": subfolder,
-                    "_raise_exceptions_for_missing_entries": False,
-                    'token': token
-                }
-
-                resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
+                if resolved_archive_file is None:
+                    filename = _add_variant(PT_WEIGHTS_NAME, variant)
+                    resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
 
-                if resolved_archive_file is None and filename == _add_variant(WEIGHTS_NAME, variant):
+                if resolved_archive_file is None and filename == _add_variant(PT_WEIGHTS_NAME, variant):
                     # Maybe the checkpoint is sharded, we try to grab the index name in this case.
                     resolved_archive_file = cached_file(
                         pretrained_model_name_or_path,
-                        _add_variant(WEIGHTS_INDEX_NAME, variant),
+                        _add_variant(PT_WEIGHTS_INDEX_NAME, variant),
                         **cached_file_kwargs,
                     )
                     if resolved_archive_file is not None:
@@ -1005,7 +968,7 @@ def load_param_into_net(model: nn.Cell, param_dict: dict, prefix: str):
                 if resolved_archive_file is None:
                     raise EnvironmentError(
                         f"{pretrained_model_name_or_path} does not appear to have a file named"
-                        f" {_add_variant(WEIGHTS_NAME, variant)}."
+                        f" {_add_variant(SAFE_WEIGHTS_NAME, variant)}, {_add_variant(PT_WEIGHTS_NAME, variant)}"
                     )
             except EnvironmentError:
                 # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted
@@ -1017,7 +980,8 @@ def from_pretrained( # pylint: disable=too-many-locals
                     f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
                     ", make sure you don't have a local directory with the"
                     f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
-                    f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)}."
+                    f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
+                    f" {_add_variant(PT_WEIGHTS_NAME, variant)}."
                 ) from exc
 
         if is_local:
@@ -1091,8 +1055,8 @@ def empty_initializer(init, shape=None, dtype=mindspore.float32):
         # These are all the pointers of shared tensors.
         tied_params = [names for _, names in ptrs.items() if len(names) > 1]
 
-        def load_ckpt(resolved_archive_file, from_pt=False):
-            if from_pt and 'ckpt' not in resolved_archive_file:
+        def load_ckpt(resolved_archive_file):
+            if 'ckpt' not in resolved_archive_file:
                 if use_safetensors:
                     from safetensors.numpy import load_file
                     origin_state_dict = load_file(resolved_archive_file)
@@ -1214,14 +1178,14 @@ def load_param_into_net(model: nn.Cell, param_dict: dict, prefix: str):
         if is_sharded:
             all_keys_unexpected = []
             for name in tqdm(converted_filenames, desc="Loading checkpoint shards"):
-                state_dict = load_ckpt(name, from_pt)
+                state_dict = load_ckpt(name)
                 keys_unexpected, keys_missing = load_param_into_net(model, state_dict, cls.base_model_prefix)
                 all_keys_unexpected.extend(keys_unexpected)
                 del state_dict
                 gc.collect()
             loaded_keys = sharded_metadata["all_checkpoint_keys"]
         else:
-            state_dict = load_ckpt(resolved_archive_file, from_pt)
+            state_dict = load_ckpt(resolved_archive_file)
             loaded_keys = list(state_dict.keys())
             all_keys_unexpected, keys_missing = load_param_into_net(model, state_dict, cls.base_model_prefix)
     else:
@@ -1266,7 +1230,6 @@ def load_param_into_net(model: nn.Cell, param_dict: dict, prefix: str):
         # Set model in evaluation mode to deactivate DropOut modules by default
         model.set_train(False)
 
-        kwargs['from_pt'] = from_pt
         # If it is a model with generation capabilities, attempt to load the generation config
        if model.can_generate() and pretrained_model_name_or_path is not None:
             try:
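Taken together, the modeling_utils.py hunks replace the old two-branch download logic (PyTorch weights when from_pt, MindSpore weights otherwise) with a single fallback chain over file names. A condensed sketch of the resulting resolution order; the constant values are assumptions, since the diff shows only the constant names, and resolve() is a stand-in for mindnlp's cached_file:

    from typing import Callable, Optional

    # Assumed values; the diff references only the constant names.
    SAFE_WEIGHTS_NAME = "model.safetensors"
    SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
    WEIGHTS_NAME = "mindspore.ckpt"
    WEIGHTS_INDEX_NAME = "mindspore.ckpt.index.json"
    PT_WEIGHTS_NAME = "pytorch_model.bin"
    PT_WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"

    def pick_checkpoint(resolve: Callable[[str], Optional[str]]) -> Optional[str]:
        """Mirror the post-commit fallback: safetensors, then .ckpt, then .bin."""
        for weights, index in (
            (SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME),
            (WEIGHTS_NAME, WEIGHTS_INDEX_NAME),
            (PT_WEIGHTS_NAME, PT_WEIGHTS_INDEX_NAME),
        ):
            found = resolve(weights) or resolve(index)  # an index hit means a sharded checkpoint
            if found:
                return found
        return None  # from_pretrained raises EnvironmentError in this case

This chain only works because cached_file is called with _raise_exceptions_for_missing_entries=False, so a missing file yields None instead of an exception.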
3 changes: 1 addition & 2 deletions mindnlp/transformers/models/auto/auto_factory.py
@@ -69,7 +69,7 @@ def from_config(cls, config, **kwargs):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         config = kwargs.pop("config", None)
-        from_pt = kwargs.get('from_pt', True)
+        _ = kwargs.get('from_pt', True)
         token = kwargs.get('token', None)
         if not isinstance(config, PretrainedConfig):
             kwargs_orig = copy.deepcopy(kwargs)
@@ -92,7 +92,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             if kwargs_orig.get("quantization_config", None) is not None:
                 kwargs["quantization_config"] = kwargs_orig["quantization_config"]
 
-        kwargs['from_pt'] = from_pt
         kwargs['token'] = token
         if type(config) in cls._model_mapping.keys():
             model_class = _get_model_class(config, cls._model_mapping)
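For orientation, the context lines above show how the auto factory picks a concrete class: the loaded config's type is looked up in cls._model_mapping, and the matched class's from_pretrained does the rest. A toy sketch of that dispatch, with an invented mapping standing in for the real registry:

    # Toy stand-ins; the real mapping pairs config classes with model classes.
    class DummyConfig:
        pass

    class DummyModel:
        @classmethod
        def from_pretrained(cls, name, **kwargs):
            return cls()  # the real method resolves and loads weights here

    _model_mapping = {DummyConfig: DummyModel}

    def _get_model_class(config, mapping):
        return mapping[type(config)]

    config = DummyConfig()
    if type(config) in _model_mapping:
        model_class = _get_model_class(config, _model_mapping)
        model = model_class.from_pretrained("dummy/checkpoint")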
4 changes: 1 addition & 3 deletions mindnlp/transformers/models/auto/tokenization_auto.py
@@ -27,7 +27,6 @@
 from collections import OrderedDict
 from typing import Dict, Optional, Union
 
-from mindnlp.configs import MS_URL_BASE, HF_URL_BASE
 from mindnlp.utils import cached_file, is_sentencepiece_available, is_tokenizers_available, logging
 from ...configuration_utils import PretrainedConfig, EncoderDecoderConfig
 from ...tokenization_utils import PreTrainedTokenizer  # pylint: disable=cyclic-import
@@ -553,8 +552,7 @@ def get_tokenizer_config(
     tokenizer_config = get_tokenizer_config("tokenizer-test")
     ```"""
 
-    from_pt = kwargs.get('from_pt', False)
-    endpoint = HF_URL_BASE if from_pt else MS_URL_BASE
+    _ = kwargs.get('from_pt', False)
     resolved_config_file = cached_file(
         pretrained_model_name_or_path,
         TOKENIZER_CONFIG_FILE,
2 changes: 0 additions & 2 deletions mindnlp/transformers/pipelines/base.py
@@ -230,8 +230,6 @@ def load_model(
     all_traceback = {}
     for model_class in class_tuple:
         kwargs = model_kwargs.copy()
-        if model.endswith(".bin") or model.endswith(".safetensors") or model.endswith(".pth"):
-            kwargs["from_pt"] = True
         try:
             model = model_class.from_pretrained(model, **kwargs)
             model = model.set_train(False)
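With the suffix check gone, pipelines no longer special-case .bin/.safetensors/.pth model paths; every load goes through the same from_pretrained path shown earlier. A usage sketch, assuming mindnlp exposes a transformers-style pipeline entry point and that the model id below is reachable:

    from mindnlp.transformers import pipeline  # entry point assumed to mirror HF transformers

    # No from_pt bookkeeping: model kwargs pass straight through, and
    # from_pretrained picks between safetensors/.ckpt/.bin by itself.
    classifier = pipeline("text-classification", model="bert-base-uncased")
    print(classifier("MindNLP now pulls weights straight from the Hugging Face Hub."))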
@@ -50,7 +50,7 @@ def test_config_model_type_from_local_file(self):
         self.assertIsInstance(config, RobertaConfig)
 
     def test_config_model_type_from_model_identifier(self):
-        config = AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, from_pt=True)
+        config = AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
         self.assertIsInstance(config, RobertaConfig)
 
     def test_config_for_model_str(self):
6 changes: 3 additions & 3 deletions tests/ut/transformers/models/auto/test_modeling_auto.py
@@ -233,13 +233,13 @@ def test_token_classification_model_from_pretrained(self):
 
 
     def test_from_pretrained_identifier(self):
-        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, from_pt=True)
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
         self.assertIsInstance(model, BertForMaskedLM)
         self.assertEqual(model.num_parameters(), 14410)
         self.assertEqual(model.num_parameters(only_trainable=True), 14410)
 
     def test_from_identifier_from_model_type(self):
-        model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, from_pt=True)
+        model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
         self.assertIsInstance(model, RobertaForMaskedLM)
         self.assertEqual(model.num_parameters(), 14410)
         self.assertEqual(model.num_parameters(only_trainable=True), 14410)
@@ -321,7 +321,7 @@ def test_model_file_not_found(self):
         with self.assertRaises(
             EnvironmentError,
         ):
-            _ = AutoModel.from_pretrained("hf-internal-testing/config-no-model", from_pt=True)
+            _ = AutoModel.from_pretrained("hf-internal-testing/config-no-model")
 
     # def test_cached_model_has_minimum_calls_to_head(self):
     #     # Make sure we have cached the model.
@@ -420,7 +420,7 @@ class AutoformerModelIntegrationTests(unittest.TestCase):
     @unittest.skip('Mindspore cannot load torch .pt file.')
     def test_inference_no_head(self):
         model = AutoformerModel.from_pretrained(
-            "huggingface/autoformer-tourism-monthly",from_pt=True)
+            "huggingface/autoformer-tourism-monthly")
         batch = prepare_batch()
 
         output = model(
@@ -446,7 +446,7 @@ def test_inference_no_head(self):
     @unittest.skip('Mindspore cannot load torch .pt file.')
     def test_inference_head(self):
         model = AutoformerForPrediction.from_pretrained(
-            "huggingface/autoformer-tourism-monthly", from_pt=True)
+            "huggingface/autoformer-tourism-monthly")
         batch = prepare_batch("val-batch.pt")
         output = model(
             past_values=batch["past_values"],
@@ -466,7 +466,7 @@ def test_inference_head(self):
     @unittest.skip('Mindspore cannot load torch .pt file.')
     def test_seq_to_seq_generation(self):
         model = AutoformerForPrediction.from_pretrained(
-            "huggingface/autoformer-tourism-monthly", from_pt=True)
+            "huggingface/autoformer-tourism-monthly")
         batch = prepare_batch("val-batch.pt")
         outputs = model.generate(
             static_categorical_features=batch["static_categorical_features"],