Skip to content

Commit

Permalink
Merge pull request #26 from tylerjthomas9/development
Browse files Browse the repository at this point in the history
Swap IO to do blocks, fix downloads when no filings are present
  • Loading branch information
tylerjthomas9 authored Dec 17, 2024
2 parents 6a3e5fa + 2898053 commit f058add
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 70 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
name = "ScrapeSEC"
uuid = "856806e7-be2f-4540-8165-3a51303b7af0"
authors = ["tylerjthomas9 <[email protected]>"]
version = "1.0.1"
version = "1.1.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Term = "22787eb5-b846-44ae-b979-8e399b8463ab"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"

[compat]
Expand All @@ -17,7 +17,7 @@ CSV = "0.10"
DataFrames = "1"
Dates = "<0.0.1, 1"
HTTP = "1"
Term = "2"
ProgressMeter = "1.10"
Test = "<0.0.1, 1"
ZipFile = "0.10"
julia = "1.6"
Expand Down
13 changes: 1 addition & 12 deletions src/ScrapeSEC.jl
Original file line number Diff line number Diff line change
@@ -1,24 +1,13 @@

module ScrapeSEC

# Dependencies
using DataFrames
using Dates: Dates
using CSV: CSV
using HTTP: HTTP
using Term.Progress
using ProgressMeter
using ZipFile: ZipFile

const progress_bar_columns = [
Progress.DescriptionColumn,
Progress.SeparatorColumn,
Progress.ProgressColumn,
Progress.CompletedColumn,
Progress.SeparatorColumn,
Progress.ETAColumn,
]

# source files
include("download_metadata.jl")
include("main_index.jl")
include("download_filings.jl")
Expand Down
55 changes: 21 additions & 34 deletions src/download_filings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ function download_filing(
mkdir(company_folder)
end

f = open(new_file, "w")
write(f, clean_text(text))
close(f)
open(new_file, "w") do f
write(f, clean_text(text))
end

return nothing
end
Expand All @@ -59,8 +59,6 @@ Parameters
* `dest`: Destination folder for downloaded filings
* `download_rate`: Number of filings to download every second (limit=10)
* `skip_file`: If true, existing files will be skipped
* `pbar`: ProgressBar (Term.jl)
* `stop_pbar`: If false, progress bar will not be stopped
* `pbar_desc`: pbar Description
* `running_tests`: If true, only downloads one file
* `clean_text`: function to clean text before writing to file
Expand All @@ -70,8 +68,6 @@ function download_filings(
dest="./data/"::String,
download_rate=10::Int,
skip_file=true::Bool,
pbar=ProgressBar(;)::ProgressBar,
stop_pbar=true::Bool,
pbar_desc="Downloading Filings"::String,
running_tests=false::Bool,
clean_text::Function=_pass_text,
Expand All @@ -92,27 +88,28 @@ function download_filings(
# download filings at 10 requests per second
sleep_time = 1 / download_rate

job = addjob!(pbar; N=size(filenames, 1), description=pbar_desc)
start!(pbar)
if skip_file
filenames = filter(file -> !isfile(joinpath(dest, replace(file, "edgar/data/" => ""))), filenames)
end

if isempty(filenames)
return nothing
end

p = Progress(size(filenames, 1); desc=pbar_desc)
for file in filenames
full_file = joinpath(dest, replace(file, "edgar/data/" => ""))
if isfile(full_file) && skip_file
continue
end

@async download_filing(file, full_file, dest; clean_text)

update!(job)
next!(p)
sleep(sleep_time)
render(pbar)

if running_tests
break
end
end
if stop_pbar
stop!(pbar)
end
finish!(p)

return nothing
end
Expand Down Expand Up @@ -141,8 +138,6 @@ Parameters
* `filing_types`: Types of filings to download (eg. ["10-K", "10-Q"])
* `download_rate`: Number of filings to download every second (limit=10)
* `skip_file`: If true, existing files will be skipped
* `pbar`: ProgressBar (Term.jl)
* `stop_pbar`: If false, progress bar will not be stopped
* `pbar_desc`: pbar Description
* `running_tests`: If true, only downloads one file
* `clean_text`: function to clean text before writing to file
Expand All @@ -153,8 +148,6 @@ function download_filings(
filing_types=["10-K"]::Vector{String},
download_rate=10::Int,
skip_file=true::Bool,
pbar=ProgressBar(;)::ProgressBar,
stop_pbar=true::Bool,
pbar_desc="Downloading Filings"::String,
running_tests=false::Bool,
clean_text::Function=_pass_text,
Expand All @@ -169,15 +162,17 @@ function download_filings(
end

df = DataFrame(CSV.File(metadata_file; delim="|"))
if isempty(df)
@warn "No filings found in metadata file: $metadata_file"
return nothing
end
df = df[(filing_types).(df[!, "Form Type"]), :]

download_filings(
df.Filename;
dest=dest,
download_rate=download_rate,
skip_file=skip_file,
pbar=pbar,
stop_pbar=stop_pbar,
pbar_desc=pbar_desc,
running_tests=running_tests,
clean_text,
Expand Down Expand Up @@ -252,12 +247,7 @@ function download_filings(
dest=metadata_dest,
skip_file=skip_metadata_file,
)

pbar = ProgressBar(; columns=progress_bar_columns)
job = addjob!(
pbar; N=size(time_periods, 1), description="Iterating Over Time Periods..."
)
start!(pbar)
p = Progress(size(time_periods, 1); desc="Iterating Over Time Periods...")
for t in time_periods
file = joinpath(metadata_dest, string(t[1]) * "-QTR" * string(t[2]) * ".tsv")
download_filings(
Expand All @@ -266,16 +256,13 @@ function download_filings(
filing_types=filing_types,
download_rate=download_rate,
skip_file=skip_file,
pbar=pbar,
stop_pbar=false,
pbar_desc="Downloading $(t[1]) Q$(t[2]) Filings",
running_tests=running_tests,
clean_text,
)
update!(job)
render(pbar)
next!(p)
end
stop!(pbar)
finish!(p)

return nothing
end
39 changes: 18 additions & 21 deletions src/download_metadata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,27 +62,27 @@ function download_metadata(

HTTP.download(url, temp_zip; update_period=Inf)
zarchive = ZipFile.Reader(temp_zip)
for f in zarchive.files
@assert f.name == "master.idx"
out = open(temp_file, "w")
write(out, read(f, String))
close(out)
for zip_file in zarchive.files
@assert zip_file.name == "master.idx"
open(temp_file, "w") do f
write(f, read(zip_file, String))
end
end
close(zarchive)
rm(temp_zip)

f = open(temp_file, "r")
metadata = readlines(f)[10:end] # skip fluff at top
close(f)
metadata = open(temp_file, "r") do f
readlines(f)[10:end] # skip fluff at top
end
rm(temp_file)

f = open(full_file, "w")
for line in metadata
if occursin("|", line) # skip "----------" line
write(f, line * "\n")
open(full_file, "w") do f
for line in metadata
if occursin("|", line) # skip "----------" line
write(f, line * "\n")
end
end
end
close(f)

return nothing
end
Expand Down Expand Up @@ -136,17 +136,14 @@ function download_metadata_files(

urls = get_metadata_urls(time_periods)
n_files = size(urls, 1)
pbar = ProgressBar(;)
job = addjob!(pbar; N=n_files, description="Downloading Metadata CSVs...")
start!(pbar)
@inbounds for idx in eachindex(urls)
update!(job)
p = Progress(n_files; desc="Downloading Metadata CSVs...")
for url in urls
ScrapeSEC.download_metadata(
urls[idx]; dest=dest, skip_file=skip_file, verbose=verbose
url; dest=dest, skip_file=skip_file, verbose=verbose
)
render(pbar)
next!(p)
end
stop!(pbar)
finish!(p)

return nothing
end
5 changes: 5 additions & 0 deletions test/download_filings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ end
running_tests=true,
)
@test isfile("./metadata/1994-QTR4.tsv")
rm("./metadata/1994-QTR4.tsv")

# Test when metadata files are empty and no filings are downloaded
download_filings(1994, 1994; filing_types=["40-F"])

rm("./metadata/1994-QTR4.tsv")
# TODO: Is it safe to clear the temp dir? I don't want to accidentally delete user files
end

0 comments on commit f058add

Please sign in to comment.