-
Notifications
You must be signed in to change notification settings - Fork 13
/
retrieve_ptab_files.sh
executable file
·210 lines (187 loc) · 5.36 KB
/
retrieve_ptab_files.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/bin/bash
#set -x
###################################################################################################################
#
#Script to download, extract, and parse PTAB PDF files from the USPTO bulk data site.
#
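#argument options:
# (1) pass -d OR --date followed by a start date YYYYMMDD (and, optionally, an end date YYYYMMDD) to download files from that date on
# (2) pass -a OR --all to download all files
# (3) pass -n OR --none to skip downloading of files and go straight to the unzipping and parsing steps
# (4) pass -h OR --help to print the usage message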
#
###################################################################################################################
#######################################
# Acquire or release the single-instance lock file.
# Globals:   lockFile (read), scriptName (read)
# Arguments: $1 - "true" to acquire the lock; any other value releases it
# Outputs:   when the lock file already exists, prints a notice to stdout and
#            records an ERR entry through log()
# Returns:   exits the whole script with status 0 when the lock file already
#            exists; otherwise returns to the caller
#######################################
function lock ( ) {
  # Compare as a string; running "$1" as a command only worked by accident
  # for the literal words true/false.
  if [[ "$1" == true ]]
  then
    if [ -f "${lockFile}" ]
    then
      echo
      echo "${lockFile} exists."
      echo "Looks like a copy of $scriptName is already running... "
      echo "Abort"
      log "ERR" " Already running, please check logs for reason... "
      exit 0
    else
      touch "${lockFile}"
    fi
  elif [ -f "${lockFile}" ]
  then
    rm -- "${lockFile}"
  fi
}
# Print the command-line synopsis to stdout.
# The end date after -d is optional; $0 is used so the message shows the name
# the script was actually invoked with.
function usage
{
  echo "usage: $0 [[-d YYYYMMDD [YYYYMMDD]] | [-a] | [-n] | [-h]]"
}
#######################################
# Write one timestamped message to stdout and to the run's log file; WARN and
# ERR messages are additionally written to the run's error file.
# Globals:   statusDirectory, processingTime, scriptName (all read)
# Arguments: $1 - severity label (e.g. INFO, WARN, ERR)
#            $2 - message text; backslash escapes such as \n and \t are expanded
# Outputs:   a blank line followed by the formatted message on stdout, and the
#            same two lines appended to
#            $statusDirectory/retrieve-log-$processingTime (and to
#            $statusDirectory/retrieve-err-$processingTime for WARN/ERR)
#######################################
function log()
{
  local type=$1
  local message=$2
  local line

  # Build the line once so stdout and the files carry the same timestamp.
  line="$type: $(date +%Y.%m.%d-%H:%M:%S) -- $scriptName -- $message"

  if [[ "$type" == "WARN" || "$type" == "ERR" ]]
  then
    # The blank separator goes to the same file as the message itself.
    { echo; printf '%b\n' "$line"; } >> "$statusDirectory/retrieve-err-$processingTime"
  fi
  #
  # always write into log file
  #
  echo
  printf '%b\n' "$line"
  { echo; printf '%b\n' "$line"; } >> "$statusDirectory/retrieve-log-$processingTime"
}
#=============================== MAIN BODY OF SCRIPT ===============================
# Flow: parse arguments -> download the weekly PTAB zip archives -> unzip them ->
# run parse_pdf.py on every extracted PDF that has no matching .txt file yet.
# Relies on GNU date (-d) and on the lock/usage/log functions defined above.
##### Constants
processingTime=$(date +%Y%m%d-%H%M%S)
scriptName=$0
statusDirectory=logs
baseURL="https://bulkdata.uspto.gov/data2/patent/trial/appeal/board/"
dropLocation="files/PTAB"
startDate=19970702
endDate=$(date +%Y%m%d)
retrieveAll=false
retrieveNone=false
lockFile="/tmp/file_download.lck"

# The log directory has to exist before the first write to it; otherwise the
# touch below and every log call made during argument parsing fail on a fresh
# checkout.
mkdir -p "$statusDirectory"
touch "$statusDirectory/log-$processingTime"

lock true
# From here on this process owns the lock. Release it on every exit path
# instead of relying on each branch remembering to call "lock false". The trap
# is installed only after "lock true" so that an instance which finds the lock
# taken never removes the other instance's lock file.
trap 'lock false' EXIT

case "$1" in
  -d | --date )
    shift
    # An empty argument is rejected explicitly because GNU date accepts an
    # empty string (it means "beginning of today").
    if [ -n "$1" ] && date "+%d/%m/%Y" -d "$1" >/dev/null 2>&1
    then
      startDate=$1
      if [ -n "$2" ]
      then
        echo "$2"
        if date "+%d/%m/%Y" -d "$2" >/dev/null 2>&1
        then
          # Normalise to YYYYMMDD: the loops below compare dates as integers.
          endDate=$(date +%Y%m%d -d "$2")
        else
          log "ERR" "end date passed in is not valid: $2"
          exit 1
        fi
      fi
      log "INFO" "Date parameters: \n\tStartDate: $startDate \n\tEndDate: $endDate"
    else
      log "ERR" "start date passed in is not valid: $1"
      exit 1
    fi
    ;;
  -a | --all )
    # The default start/end dates already span the full range, so nothing
    # else has to be set here.
    #retrieveAll=true
    log "INFO" "Date parameters: \n\tStartDate: $startDate \n\tEndDate: $endDate"
    log "INFO" "Retrieve All parameter set to TRUE"
    ;;
  -n | --none )
    retrieveNone=true
    log "INFO" "Retrieve None parameter set to TRUE"
    ;;
  -h | --help )
    usage
    exit 0
    ;;
  * )
    usage
    log "ERR" "argument passed in is not valid: $1"
    exit 1
    ;;
esac

#create directory for downloaded files and logs(if does not exist already)
mkdir -p "$dropLocation"
mkdir -p "$statusDirectory"
log "INFO" "-[JOB START] $(date): ------------"

# Snap the start date to a Friday: subtract its day-of-week number (%u, Monday=1)
# and add five days. NOTE(review): presumably the weekly archives are dated on
# Fridays - confirm against the bulk data site.
startDate=$(date '+%C%y%m%d' -d "$startDate -$(date -d "$startDate" +%u) days + 5 day")
begDate=$startDate

if [[ "$retrieveNone" != true ]]
then
  log "INFO" "Starting file download process"
  while [ "$begDate" -le "$endDate" ]
  do
    year=$(date -d "$begDate" +%Y)
    week=$(date -d "$begDate" +%V)
    if [ "$year" -eq 2015 ]
    then
      # 10# forces base 10 so that weeks such as "08" are not read as octal.
      # NOTE(review): the 2015 archive names apparently use a week number one
      # lower than the ISO week - confirm.
      week="$(printf "%02d" $((10#$week-1)))"
    fi
    zipFilePath="${baseURL}PTAB_${begDate}_WK${week}.zip"
    # --spider only checks that the remote file exists; nothing is downloaded.
    if wget -q --spider "$zipFilePath"
    then
      log "INFO" "Downloading file: $zipFilePath"
      # -nc leaves files that are already present in the drop location alone.
      if ! wget -nc -P "$dropLocation" "$zipFilePath" >> "$statusDirectory/retrieve-log-$processingTime" 2>&1
      then
        log "ERR" "download failed: $zipFilePath"
      fi
    else
      log "ERR" "file does not exist: $zipFilePath"
    fi
    begDate=$(date '+%C%y%m%d' -d "$begDate+7 days")
  done
  log "INFO" "File download process complete"
#if --none flag is set then skip download process
else
  log "INFO" "skipping file download process"
fi

#unzip all zip files unless they have already been unzipped
log "INFO" "Starting file unzipping process"
# unzip -n never overwrites files that were extracted by an earlier run.
find "$dropLocation" -type f -name "*.zip" -exec unzip -n {} -d "$dropLocation" \;
log "INFO" "File unzip process complete"

log "INFO" "Starting file parsing process"
#parse all pdf files that have not already been parsed
begDate=$startDate
echo "$begDate"
while [ "$begDate" -le "$endDate" ]
do
  for f in "$dropLocation"/PTAB_*"$begDate"
  do
    if [[ $f == *.zip ]]
    then
      continue
    fi
    echo "$f"
    if [ -d "$f/PDF_image" ]
    then
      for i in "$f"/PDF_image/*.pdf
      do
        # An unmatched glob stays a literal "*.pdf"; do not hand it to the parser.
        [ -e "$i" ] || continue
        # A .txt file next to the PDF marks the document as already parsed.
        if [ ! -f "${i%.*}.txt" ]
        then
          log "INFO" "Parsing document: $i to ${i%.*}.txt"
          python parse_pdf.py "$i" >> "$statusDirectory/retrieve-log-$processingTime" 2>&1
          # leaving this cURL command in so we can use it for reference or debugging
          # curl -X PUT --data-binary @$i http://192.168.99.100:9998/tika --header "Content-type: application/pdf" > ${i%.*}.txt
        fi
      done
    else
      log "INFO" "No files to parse"
    fi
  done
  begDate=$(date '+%C%y%m%d' -d "$begDate+7 days")
done
log "INFO" "File parsing process complete"
log "INFO" "-[JOB END]-- $(date): ------------"
# The lock is released by the EXIT trap installed after "lock true".