forked from rasbt/machine-learning-book
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathch16-part2-gpt2.py
134 lines (50 loc) · 1.99 KB
/
ch16-part2-gpt2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# coding: utf-8
import sys

# Add the parent folder to the module search path BEFORE importing
# check_packages. The import below relies on this entry; with the insertion
# placed after the import (as it was originally), the import runs against the
# unmodified path and the path tweak comes too late to help.
# NOTE: '..' is a relative entry, so it is resolved against the current
# working directory -- run the script from its own folder.
sys.path.insert(0, '..')

from python_environment_check import check_packages  # noqa: E402
from transformers import pipeline, set_seed  # noqa: E402
from transformers import GPT2Tokenizer  # noqa: E402
from transformers import GPT2Model  # noqa: E402

# # Machine Learning with PyTorch and Scikit-Learn
# # -- Code Examples
# ## Package version checks
# Check recommended package versions (package name -> recommended version):
d = {
    'torch': '1.9.0',
    'transformers': '4.9.1',
}
check_packages(d)
# # Chapter 16: Transformers – Improving Natural Language Processing with Attention Mechanisms (Part 2/3)
# **Outline**
#
# - [Building large-scale language models by leveraging unlabeled data](#Building-large-scale-language-models-by-leveraging-unlabeled-data)
# - [Pre-training and fine-tuning transformer models](#Pre-training-and-fine-tuning-transformer-models)
# - [Leveraging unlabeled data with GPT](#Leveraging-unlabeled-data-with-GPT)
# - [Using GPT-2 to generate new text](#Using-GPT-2-to-generate-new-text)
# - [Bidirectional pre-training with BERT](#Bidirectional-pre-training-with-BERT)
# - [The best of both worlds: BART](#The-best-of-both-worlds-BART)
# ## Building large-scale language models by leveraging unlabeled data
# ## Pre-training and fine-tuning transformer models
#
#
# ## Leveraging unlabeled data with GPT
# ### Using GPT-2 to generate new text
# Build a text-generation pipeline backed by the pre-trained 'gpt2' checkpoint.
generator = pipeline('text-generation', model='gpt2')

# Fix the random seed so the sampled continuations are repeatable across runs.
set_seed(123)

# Request three continuations of the prompt, each capped by max_length=20.
# In the notebook the result was displayed implicitly as the cell's last
# expression; in a plain script a bare expression is discarded, so print it.
print(generator("Hey readers, today is",
                max_length=20,
                num_return_sequences=3))
# Load the tokenizer that matches the pre-trained 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a sample sentence; return_tensors='pt' requests PyTorch tensors.
text = "Let us encode this sentence"
encoded_input = tokenizer(text, return_tensors='pt')
# A bare `encoded_input` only displays in a notebook; print it so the script
# actually shows the encoding.
print(encoded_input)

# Load the pre-trained GPT-2 model and run the encoded sentence through it.
model = GPT2Model.from_pretrained('gpt2')
output = model(**encoded_input)

# Show the shape of the final hidden states (printed explicitly for the same
# reason as above).
# NOTE(review): presumably (batch, tokens, hidden size) -- confirm against the
# GPT2Model documentation.
print(output['last_hidden_state'].shape)
# ### Bidirectional pre-training with BERT
#
# ### The best of both worlds: BART
# ---
#
# Readers may ignore the next cell.