Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lake County Sheriff scraper, YouTube downloader fixes #238

Merged
60 changes: 60 additions & 0 deletions scrapers_library/CA/lake_county/lake_county_sheriff/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Lake County Sheriff Scraper

## Source info

Records related to the Lake County Sheriff as part of California's SB 1421.

The records can be found online at [https://www.lakesheriff.com/969/Use-of-Force](https://www.lakesheriff.com/969/Use-of-Force)

## Content warning

Some of the videos and images contain graphic displays of police violence and bodily harm. Viewer discretion is advised.

## Storage and execution time

This scraper requires at least 14 GB of available disk space for all files and takes ~23 minutes to complete, depending on disk and network speed.

## Content redaction

Video, audio, documents, and images may contain redacted data to protect the privacy of those involved.

## Objectively Reasonable

>The legal standard used to determine the lawfulness and appropriateness of a use of force is the Fourth Amendment to the United States Constitution. See Graham versus Connor, 490 U.S. 386 (1989). Graham states in part, The reasonableness of a particular use of force must be judged from the perspective of a reasonable officer on the scene, rather than with the 20/20 vision of hindsight. The calculus of reasonableness must embody allowance for the fact that police officers are often forced to make split-second judgments - in circumstances that are tense, uncertain, and rapidly evolving - about the amount of force that is necessary in a particular situation. The test of reasonableness is not capable of precise definition or mechanical application. The force must be reasonable under the circumstances known to the officer at the time the force was used. Therefore, the Sheriff's Office examines all uses of force from an objective standard, rather than a subjective standard.

## Sample response

Sample response is omitted because many of the files are large.

The final folder structure is as follows:

- `./data/Case 21438/`
- `./data/Case 23408/`
- `./data/Case 01070402/`
- `./data/Case 08020293/`
- `./data/Case 10080048/`
- `./data/Case 14010032/`
- `./data/Case 14110123/`
- `./data/Case 15020285/`
- `./data/Case 17030017/`
- `./data/Case 18020066/`
- `./data/Case 19070164/`
- `./data/Case 19120322/`
- `./data/Case 20020144/`
- `./data/Case 20120287/`
- `./data/Case 21050095/`
- `./data/Case 21090240/`
- `./data/Case 22010120/`
- `./data/Case 23110157/`
- `./data/IA 2018-0023/`

## Requirements

- `Python 3`
- `requests`
- `tqdm`
- `m3u8`
- `pytube`
- `BeautifulSoup4`
- `inputimeout`
- `from_root`
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import os
import sys

import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from from_root import from_root

# Take the parent directory of the path from_root() returns for CONTRIBUTING.md
# (presumably the repository root -- confirm against from_root's behaviour) and
# put it on sys.path so the `utils` package imported below can be resolved.
p = from_root("CONTRIBUTING.md").parent
sys.path.insert(1, str(p))

from utils.video.video_downloaders import youtube_downloader


def _media_filename(link_text, title):
    """Derive the on-disk filename for a media link from its link text.

    The link text is expected to end with a tag naming the media type, e.g.
    ``"Report (PDF)"`` or ``"Interview (audio only)"``.

    Args:
        link_text (str): Text of the anchor that links to the media file.
        title (str): Title of the case page the link was found on.

    Returns:
        str | None: Filename to save the media as, or None when the link text
            does not end with a recognised media tag.
    """
    if link_text.endswith("(audio only)"):
        # The tag is kept as part of the name for audio-only recordings.
        return link_text + ".mp3"

    # Compare case-insensitively so "(wav)" and "(WAV)" are treated alike.
    tag = link_text[-5:].upper()
    if tag in ("(PDF)", "(MP4)", "(MP3)", "(WAV)"):
        extension = "." + tag[1:4].lower()
    elif tag == "(VID)":
        # "(VID)" links are saved as .mp4 for case IA 2018-0023 and as .vob
        # for every other case.
        extension = ".mp4" if "IA 2018-0023" in title else ".vob"
    else:
        return None

    # Drop the tag and whatever whitespace separated it from the name.
    return link_text[:-5].rstrip() + extension


def get_case_media(url):
    """Downloads all media files linked on a case's page.

    YouTube links are handed to ``youtube_downloader``, photo galleries to
    ``get_photo_gallery``, tagged media files are downloaded from
    lakesheriff.com, and any other link is saved as an ``.html`` page.

    Args:
        url (str): Url of the page where the case media is linked.

    Raises:
        requests.HTTPError: If the case page responds with an error status.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    r = requests.get(url, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")
    title = soup.find("h1", id="versionHeadLine").text.strip()
    # href=True skips anchors that carry no link target at all.
    a_list = soup.find(class_="moduleContentNew").find_all("a", href=True)

    print(f"Retrieving {title} media...")

    # Every file of a case goes into the same folder, so compute it once.
    savedir = f"./data/{title}/"

    for a in a_list:
        link_text = a.text
        href = a["href"]

        if "YouTube" in link_text:
            youtube_downloader(href, savedir)
            continue
        if "Photo Gallery" in link_text:
            get_photo_gallery(savedir)
            continue

        filename = _media_filename(link_text, title)
        if filename is None:
            # Not a tagged media file: retrieve the linked webpage as html.
            # urljoin leaves absolute links alone and resolves relative ones
            # against the case page.
            download_file(urljoin(url, href), savedir=savedir, filename=link_text + ".html")
            continue

        # urljoin yields the same result as prefixing the domain for
        # root-relative links, and also copes with absolute ones.
        download_url = urljoin("https://www.lakesheriff.com", href)
        download_file(download_url, savedir=savedir, filename=filename)


def get_photo_gallery(savedir):
    """Retrieves all images from a photo gallery.

    Only the galleries of cases 18020066 and 14110123 are known; for any other
    directory nothing is downloaded.

    Args:
        savedir (str): Directory of the case; images are saved to its
            ``Images/`` subdirectory.
    """
    # Document ID bounds per case number: first ID inclusive, last exclusive.
    gallery_bounds = {
        "18020066": (3753, 3783),
        "14110123": (2573, 2876),
    }

    image_dir = savedir + "Images/"

    for case_number, bounds in gallery_bounds.items():
        if case_number in image_dir:
            first_id, stop_id = bounds
            break
    else:
        # No known gallery for this directory.
        return

    for document_id in tqdm(range(first_id, stop_id), desc="Downloading image files"):
        download_file(
            f"https://www.lakesheriff.com/ImageRepository/Document?documentID={document_id}",
            image_dir,
            f"Image {document_id}.jpg",
            disable=True,
        )


def download_file(url, savedir, filename=None, disable=False):
    """Downloads a file to a given directory.

    The download is skipped when the target file already exists. Data is
    streamed to a temporary ``.part`` file that is renamed to its final name
    only after the transfer completes, so an interrupted run never leaves a
    truncated file that a later run would mistake for a finished download.

    Args:
        url (str): Url of the file to download.
        savedir (str): Directory where the file will be saved.
        filename (str, optional): Name the file will be saved as. Defaults to last part of url.
        disable (bool, optional): Whether or not to disable the progress bar in the command line. Defaults to False.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    if filename is None:
        filename = url.split("/")[-1]

    # os.path.join works whether or not savedir ends with a separator.
    target = os.path.join(savedir, filename)

    if os.path.exists(target):
        if not disable:
            print("File already exists: " + filename)
        return

    os.makedirs(savedir, exist_ok=True)

    partial = target + ".part"
    completed = False
    try:
        # The timeout keeps a stalled server from hanging the scraper forever.
        with requests.get(url, stream=True, timeout=30) as r:
            if not r.ok:
                # Do not store an HTTP error page under the media's filename.
                # tqdm.write keeps the message from garbling an active bar.
                tqdm.write(f"Download failed (HTTP {r.status_code}): {url}")
                return

            total = int(r.headers.get("content-length", 0))
            progress_bar = tqdm(total=total, unit="iB", unit_scale=True, desc=filename, disable=disable)

            with open(partial, "wb") as f, progress_bar:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
                    progress_bar.update(len(chunk))

        os.replace(partial, target)
        completed = True
    finally:
        # Remove the leftover temporary file when the download did not finish.
        if not completed and os.path.exists(partial):
            os.remove(partial)


def main():
    """Saves the Use of Force index page and downloads the media of every case linked from it."""
    index_url = "https://www.lakesheriff.com/969/Use-of-Force"

    # Keep a copy of the index page next to the case folders.
    download_file(index_url, savedir="./data/", filename="Use of Force.html", disable=True)

    index_page = BeautifulSoup(requests.get(index_url).content, "html.parser")
    case_links = index_page.find(class_="fr-alternate-rows").find_all("a")

    # Cases are processed in the reverse of the order they appear on the page.
    for link in reversed(case_links):
        get_case_media("https://www.lakesheriff.com" + link["href"])
        print()


if __name__ == "__main__":
    main()
75 changes: 61 additions & 14 deletions utils/video/video_downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

import requests
import m3u8
from inputimeout import inputimeout, TimeoutOccurred
from tqdm import tqdm
import pytube
from pytube import YouTube
from pytube.innertube import InnerTube


def youtube_downloader(youtube_url, savedir, disable_progressbar=False):
Expand All @@ -17,22 +20,67 @@ def youtube_downloader(youtube_url, savedir, disable_progressbar=False):
disable_progressbar (bool, optional): Whether to disable the progress bar in the command line. Default is False.
"""
"""Callaback function used to update the download progress bar."""
progress_callback = lambda stream, data_chunk, bytes_remaining: progress_bar.update(
len(data_chunk)
)
progress_callback = lambda stream, data_chunk, bytes_remaining: progress_bar.update(len(data_chunk))

yt = YouTube(youtube_url, on_progress_callback=progress_callback, disable=disable_progressbar)
yt = YouTube(youtube_url, on_progress_callback=progress_callback)

if os.path.exists(savedir + yt.title + ".mp4"):
filename = savedir + yt.title + ".mp4"
if os.path.exists(filename):
return

stream = yt.streams.get_highest_resolution()

progress_bar = tqdm(
total=stream.filesize, unit="iB", unit_scale=True, desc=yt.title
)

stream.download(output_path=savedir)
try:
stream = yt.streams.get_highest_resolution()
except pytube.exceptions.AgeRestrictedError:
# Attempt to override YouTube's age restriction
yt = YouTube_Override(youtube_url, on_progress_callback=progress_callback)

try:
stream = yt.streams.get_highest_resolution()
except KeyError:
# Some video's age restriction is unable to be overridden and requires a sign in
print("This YouTube video is age restricted and requires that you sign in to YouTube to access it.")
print("Login will only be required once and will be cached for later.")
try:
signin = inputimeout(prompt="Would you like to sign in? (y/n): ", timeout=30)
except TimeoutOccurred:
signin = "n"
return

if signin.lower() == "y":
yt = YouTube_Override(youtube_url, on_progress_callback=progress_callback, use_oauth=True)
stream = yt.streams.get_highest_resolution()
else:
return

progress_bar = tqdm(total=stream.filesize, unit="iB", unit_scale=True, desc=yt.title, disable=disable_progressbar)

retries = 0
while retries < 5:
try:
stream.download(output_path=savedir)
break
except Exception as e:
print("Download failed, retrying...")
os.remove(filename.replace("#", ""))
retries = retries + 1


class YouTube_Override(YouTube):
    """YouTube subclass that works around a PyTube failure to bypass age restrictions."""

    def bypass_age_gate(self):
        """Replace the video info with the response of the ANDROID InnerTube client.

        Raises:
            pytube.exceptions.AgeRestrictedError: If the response reports the
                video as ``UNPLAYABLE``.
        """
        player_info = InnerTube(
            client="ANDROID",
            use_oauth=self.use_oauth,
            allow_cache=self.allow_oauth_cache,
        ).player(self.video_id)

        # An "UNPLAYABLE" status means the video is still inaccessible
        # (tier 3 age restriction), so there is nothing more to try here.
        if player_info["playabilityStatus"].get("status") == "UNPLAYABLE":
            raise pytube.exceptions.AgeRestrictedError(self.video_id)

        self._vid_info = player_info


def ts_downloader(m3u8_url, savedir, filename, disable_progressbar=False):
Expand All @@ -59,8 +107,7 @@ def ts_downloader(m3u8_url, savedir, filename, disable_progressbar=False):
# Download the individual segments
with ThreadPoolExecutor(max_workers=6) as executor:
future_to_url = [
executor.submit(download_file, url + seg["uri"], TS_DIR)
for seg in m3u8_master.data["segments"]
executor.submit(download_file, url + seg["uri"], TS_DIR) for seg in m3u8_master.data["segments"]
]

for future in tqdm(
Expand Down