diff --git a/Project.toml b/Project.toml index 39b63b6..06393b6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,14 +1,14 @@ name = "ScrapeSEC" uuid = "856806e7-be2f-4540-8165-3a51303b7af0" authors = ["tylerjthomas9 "] -version = "1.0.1" +version = "1.1.0" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" -Term = "22787eb5-b846-44ae-b979-8e399b8463ab" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [compat] @@ -17,7 +17,7 @@ CSV = "0.10" DataFrames = "1" Dates = "<0.0.1, 1" HTTP = "1" -Term = "2" +ProgressMeter = "1.10" Test = "<0.0.1, 1" ZipFile = "0.10" julia = "1.6" diff --git a/src/ScrapeSEC.jl b/src/ScrapeSEC.jl index db3fcd0..9ffbab0 100644 --- a/src/ScrapeSEC.jl +++ b/src/ScrapeSEC.jl @@ -1,24 +1,13 @@ module ScrapeSEC -# Dependencies using DataFrames using Dates: Dates using CSV: CSV using HTTP: HTTP -using Term.Progress +using ProgressMeter using ZipFile: ZipFile -const progress_bar_columns = [ - Progress.DescriptionColumn, - Progress.SeparatorColumn, - Progress.ProgressColumn, - Progress.CompletedColumn, - Progress.SeparatorColumn, - Progress.ETAColumn, -] - -# source files include("download_metadata.jl") include("main_index.jl") include("download_filings.jl") diff --git a/src/download_filings.jl b/src/download_filings.jl index bdc3488..38a3ede 100644 --- a/src/download_filings.jl +++ b/src/download_filings.jl @@ -30,9 +30,9 @@ function download_filing( mkdir(company_folder) end - f = open(new_file, "w") - write(f, clean_text(text)) - close(f) + open(new_file, "w") do f + write(f, clean_text(text)) + end return nothing end @@ -59,8 +59,6 @@ Parameters * `dest`: Destination folder for downloaded filings * `download_rate`: Number of filings to download every second (limit=10) * `skip_file`: If true, existing files will be skipped -* `pbar`: ProgressBar (Term.jl) -* `stop_pbar`: If false, progress bar will not be stopped * `pbar_desc`: pbar Description * `runnings_tests`: If true, only downloads one file * `clean_text`: function to clean text before writing to file @@ -70,8 +68,6 @@ function download_filings( dest="./data/"::String, download_rate=10::Int, skip_file=true::Bool, - pbar=ProgressBar(;)::ProgressBar, - stop_pbar=true::Bool, pbar_desc="Downloading Filings"::String, running_tests=false::Bool, clean_text::Function=_pass_text, @@ -92,27 +88,28 @@ function download_filings( # download filings at 10 requests per second sleep_time = 1 / download_rate - job = addjob!(pbar; N=size(filenames, 1), description=pbar_desc) - start!(pbar) + if skip_file + filenames = filter(file -> !isfile(joinpath(dest, replace(file, "edgar/data/" => ""))), filenames) + end + + if isempty(filenames) + return nothing + end + + p = Progress(size(filenames, 1); desc=pbar_desc) for file in filenames full_file = joinpath(dest, replace(file, "edgar/data/" => "")) - if isfile(full_file) && skip_file - continue - end @async download_filing(file, full_file, dest; clean_text) - update!(job) + next!(p) sleep(sleep_time) - render(pbar) if running_tests break end end - if stop_pbar - stop!(pbar) - end + finish!(p) return nothing end @@ -141,8 +138,6 @@ Parameters * `filing_types`: Types of filings to download (eg. ["10-K", "10-Q"]) * `download_rate`: Number of filings to download every second (limit=10) * `skip_file`: If true, existing files will be skipped -* `pbar`: ProgressBar (Term.jl) -* `stop_pbar`: If false, progress bar will not be stopped * `pbar_desc`: pbar Description * `runnings_tests`: If true, only downloads one file * `clean_text`: function to clean text before writing to file @@ -153,8 +148,6 @@ function download_filings( filing_types=["10-K"]::Vector{String}, download_rate=10::Int, skip_file=true::Bool, - pbar=ProgressBar(;)::ProgressBar, - stop_pbar=true::Bool, pbar_desc="Downloading Filings"::String, running_tests=false::Bool, clean_text::Function=_pass_text, @@ -169,6 +162,10 @@ function download_filings( end df = DataFrame(CSV.File(metadata_file; delim="|")) + if isempty(df) + @warn "No filings found in metadata file: $metadata_file" + return nothing + end df = df[∈(filing_types).(df[!, "Form Type"]), :] download_filings( @@ -176,8 +173,6 @@ function download_filings( dest=dest, download_rate=download_rate, skip_file=skip_file, - pbar=pbar, - stop_pbar=stop_pbar, pbar_desc=pbar_desc, running_tests=running_tests, clean_text, @@ -252,12 +247,7 @@ function download_filings( dest=metadata_dest, skip_file=skip_metadata_file, ) - - pbar = ProgressBar(; columns=progress_bar_columns) - job = addjob!( - pbar; N=size(time_periods, 1), description="Iterating Over Time Periods..." - ) - start!(pbar) + p = Progress(size(time_periods, 1); desc="Iterating Over Time Periods...") for t in time_periods file = joinpath(metadata_dest, string(t[1]) * "-QTR" * string(t[2]) * ".tsv") download_filings( @@ -266,16 +256,13 @@ function download_filings( filing_types=filing_types, download_rate=download_rate, skip_file=skip_file, - pbar=pbar, - stop_pbar=false, pbar_desc="Downloading $(t[1]) Q$(t[2]) Filings", running_tests=running_tests, clean_text, ) - update!(job) - render(pbar) + next!(p) end - stop!(pbar) + finish!(p) return nothing end diff --git a/src/download_metadata.jl b/src/download_metadata.jl index 3a5c092..edbfec8 100644 --- a/src/download_metadata.jl +++ b/src/download_metadata.jl @@ -62,27 +62,27 @@ function download_metadata( HTTP.download(url, temp_zip; update_period=Inf) zarchive = ZipFile.Reader(temp_zip) - for f in zarchive.files - @assert f.name == "master.idx" - out = open(temp_file, "w") - write(out, read(f, String)) - close(out) + for zip_file in zarchive.files + @assert zip_file.name == "master.idx" + open(temp_file, "w") do f + write(f, read(zip_file, String)) + end end close(zarchive) rm(temp_zip) - f = open(temp_file, "r") - metadata = readlines(f)[10:end] # skip fluff at top - close(f) + metadata = open(temp_file, "r") do f + readlines(f)[10:end] # skip fluff at top + end rm(temp_file) - f = open(full_file, "w") - for line in metadata - if occursin("|", line) # skip "----------" line - write(f, line * "\n") + open(full_file, "w") do f + for line in metadata + if occursin("|", line) # skip "----------" line + write(f, line * "\n") + end end end - close(f) return nothing end @@ -136,17 +136,14 @@ function download_metadata_files( urls = get_metadata_urls(time_periods) n_files = size(urls, 1) - pbar = ProgressBar(;) - job = addjob!(pbar; N=n_files, description="Downloading Metadata CSVs...") - start!(pbar) - @inbounds for idx in eachindex(urls) - update!(job) + p = Progress(n_files; desc="Downloading Metadata CSVs...") + for url in urls ScrapeSEC.download_metadata( - urls[idx]; dest=dest, skip_file=skip_file, verbose=verbose + url; dest=dest, skip_file=skip_file, verbose=verbose ) - render(pbar) + next!(p) end - stop!(pbar) + finish!(p) return nothing end diff --git a/test/download_filings.jl b/test/download_filings.jl index 7e7e2ac..24355a8 100644 --- a/test/download_filings.jl +++ b/test/download_filings.jl @@ -17,6 +17,11 @@ end running_tests=true, ) @test isfile("./metadata/1994-QTR4.tsv") + rm("./metadata/1994-QTR4.tsv") + + # Test when metadata files are empty and no filings are downloaded + download_filings(1994, 1994; filing_types=["40-F"]) + rm("./metadata/1994-QTR4.tsv") # TODO: Is it safe to clear the temp dir? I dont want to accidently user files end