Police-Data-Accessibility-Project · mbodeantor · Dec 20, 2023 · Dec 11, 2023 · Dec 13, 2023 · Dec 14, 2023
@@ -0,0 +1,60 @@
+# Lake County Sheriff Scraper
+
+## Source info
+
+Records related to the Lake County Sheriff as part of California's SB 1421.
+
+The records can be found online at [https://www.lakesheriff.com/969/Use-of-Force](https://www.lakesheriff.com/969/Use-of-Force)
+
+## Content warning
+
+Some of the videos and images contain graphic displays of police violence and bodily harm. Viewer discretion is advised.
+
+## Storage and execution time
+
+This scraper requires at least 14 GB of available disk space for all files and takes ~23 minutes to complete, depending on disk and network speed.
+
+## Content redaction
+
+Video, audio, documents, and images may contain redacted data to protect the privacy of those involved.
+
+## Objectively Reasonable
+
+>The legal standard used to determine the lawfulness and appropriateness of a use of force is the Fourth Amendment to the United States Constitution. See Graham versus Connor, 490 U.S. 386 (1989). Graham states in part, The reasonableness of a particular use of force must be judged from the perspective of a reasonable officer on the scene, rather than with the 20/20 vision of hindsight. The calculus of reasonableness must embody allowance for the fact that police officers are often forced to make split-second judgments - in circumstances that are tense, uncertain, and rapidly evolving - about the amount of force that is necessary in a particular situation. The test of reasonableness is not capable of precise definition or mechanical application. The force must be reasonable under the circumstances known to the officer at the time the force was used. Therefore, the Sheriff's Office examines all uses of force from an objective standard, rather than a subjective standard.
+
+## Sample response
+
+A sample response is omitted because many of the files are large.
+
+The final folder structure is as follows:
+
+- `./data/Case 21438/`
+- `./data/Case 23408/`
+- `./data/Case 01070402/`
+- `./data/Case 08020293/`
+- `./data/Case 10080048/`
+- `./data/Case 14010032/`
+- `./data/Case 14110123/`
+- `./data/Case 15020285/`
+- `./data/Case 17030017/`
+- `./data/Case 18020066/`
+- `./data/Case 19070164/`
+- `./data/Case 19120322/`
+- `./data/Case 20020144/`
+- `./data/Case 20120287/`
+- `./data/Case 21050095/`
+- `./data/Case 21090240/`
+- `./data/Case 22010120/`
+- `./data/Case 23110157/`
+- `./data/IA 2018-0023/`
+
+## Requirements
+
+- `Python 3`
+- `requests`
+- `tqdm`
+- `m3u8`
+- `pytube`
+- `BeautifulSoup4`
+- `inputimeout`
+- `from_root`
@@ -0,0 +1,144 @@
+import os
+import sys
+
+import requests
+from tqdm import tqdm
+from bs4 import BeautifulSoup
+from from_root import from_root
+
+p = from_root("CONTRIBUTING.md").parent
+sys.path.insert(1, str(p))
+
+from utils.video.video_downloaders import youtube_downloader
+
+
+def get_case_media(url):
+    """Downloads all media files linked on a case's page.
+
+    Args:
+        url (str): Url of the page where the case media is linked.
+    """
+    r = requests.get(url)
+
+    soup = BeautifulSoup(r.content, "html.parser")
+    title = soup.find("h1", id="versionHeadLine").text.strip()
+    a_list = soup.find(class_="moduleContentNew").find_all("a")
+
+    print(f"Retrieving {title} media...")
+
+    for a in a_list:
+        filename = a.text
+        savedir = f"./data/{title}/"
+
+        if "YouTube" in filename:
+            youtube_downloader(a["href"], savedir)
+            continue
+        elif "Photo Gallery" in filename:
+            get_photo_gallery(savedir)
+            continue
+
+        filetype = ""
+        if (
+            filename.endswith("(PDF)")
+            or filename.endswith("(MP4)")
+            or filename.endswith("(MP3)")
+            or filename.upper().endswith("(WAV)")
+        ):
+            # Grab the last part of the filename to be used as the file extention
+            filetype = "." + filename[len(filename) - 4 : len(filename) - 1].lower()
+        elif filename.endswith("(VID)"):
+            if "IA 2018-0023" in title:
+                filetype = ".mp4"
+            else:
+                filetype = ".vob"
+        elif filename.endswith("(audio only)"):
+            filename = filename + ".mp3"
+        else:
+            # Retrieve webpage as an html
+            webpage_url = a["href"]
+            filename = filename + ".html"
+            download_file(webpage_url, savedir=savedir, filename=filename)
+            continue
+
+        if not filename.endswith("(audio only).mp3"):
+            # Remove the file extension from the last part of the filename
+            filename = filename[: len(filename) - 6] + filetype
+
+        download_url = "https://www.lakesheriff.com" + a["href"]
+
+        download_file(download_url, savedir=savedir, filename=filename)
+
+
+def get_photo_gallery(savedir):
+    """Retrieves all images from a photo gallery.
+
+    Args:
+        savedir (str): Directory where the images will be saved.
+    """
+    savedir = savedir + "Images/"
+
+    if "18020066" in savedir:
+        start = 3753
+        end = 3783
+    elif "14110123" in savedir:
+        start = 2573
+        end = 2876
+    else:
+        return
+
+    for p in tqdm(range(start, end), desc="Downloading image files"):
+        image_url = f"https://www.lakesheriff.com/ImageRepository/Document?documentID={p}"
+        filename = f"Image {p}.jpg"
+
+        download_file(image_url, savedir, filename, disable=True)
+
+
+def download_file(url, savedir, filename=None, disable=False):
+    """Downloads a file to a given directory.
+
+    Args:
+        url (str): Url of the file to download.
+        savedir (str): Directory where the file will be saved.
+        filename (str, optional): Name the file will be saved as. Defaults to last part of url.
+        disable (bool, optional): Whether or not to disable the progress bar in the command line. Defaults to False.
+    """
+    if filename is None:
+        filename = url.split("/")[-1]
+
+    if os.path.exists(savedir + filename):
+        if not disable:
+            print("File already exists: " + filename)
+        return
+
+    os.makedirs(savedir, exist_ok=True)
+
+    r = requests.get(url, stream=True)
+
+    total = int(r.headers.get("content-length", 0))
+    progress_bar = tqdm(total=total, unit="iB", unit_scale=True, desc=filename, disable=disable)
+
+    with open(savedir + filename, "wb") as f:
+        for chunk in r.iter_content(chunk_size=1024):
+            progress_bar.update(len(chunk))
+            f.write(chunk)
+
+    progress_bar.close()
+
+
+def main():
+    homepage_url = "https://www.lakesheriff.com/969/Use-of-Force"
+    download_file(homepage_url, savedir="./data/", filename="Use of Force.html", disable=True)
+
+    r = requests.get(homepage_url)
+    soup = BeautifulSoup(r.content, "html.parser")
+    a_list = soup.find(class_="fr-alternate-rows").find_all("a")
+    a_list.reverse()
+
+    for a in a_list:
+        url = "https://www.lakesheriff.com" + a["href"]
+        get_case_media(url)
+        print()
+
+
+if __name__ == "__main__":
+    main()
@@ -4,8 +4,11 @@
 
 import requests
 import m3u8
+from inputimeout import inputimeout, TimeoutOccurred
 from tqdm import tqdm
+import pytube
 from pytube import YouTube
+from pytube.innertube import InnerTube
 
 
 def youtube_downloader(youtube_url, savedir, disable_progressbar=False):
@@ -17,22 +20,67 @@ def youtube_downloader(youtube_url, savedir, disable_progressbar=False):
         disable_progressbar (bool, optional): Whether to disable the progress bar in the command line. Default is False.
     """
     """Callaback function used to update the download progress bar."""
-    progress_callback = lambda stream, data_chunk, bytes_remaining: progress_bar.update(
-        len(data_chunk)
-    )
+    progress_callback = lambda stream, data_chunk, bytes_remaining: progress_bar.update(len(data_chunk))
 
-    yt = YouTube(youtube_url, on_progress_callback=progress_callback, disable=disable_progressbar)
+    yt = YouTube(youtube_url, on_progress_callback=progress_callback)
 
-    if os.path.exists(savedir + yt.title + ".mp4"):
+    filename = savedir + yt.title + ".mp4"
+    if os.path.exists(filename):
         return
 
-    stream = yt.streams.get_highest_resolution()
-
-    progress_bar = tqdm(
-        total=stream.filesize, unit="iB", unit_scale=True, desc=yt.title
-    )
-
-    stream.download(output_path=savedir)
+    try:
+        stream = yt.streams.get_highest_resolution()
+    except pytube.exceptions.AgeRestrictedError:
+        # Attempt to override YouTube's age restriction
+        yt = YouTube_Override(youtube_url, on_progress_callback=progress_callback)
+
+        try:
+            stream = yt.streams.get_highest_resolution()
+        except KeyError:
+            # Some video's age restriction is unable to be overridden and requires a sign in
+            print("This YouTube video is age restricted and requires that you sign in to YouTube to access it.")
+            print("Login will only be required once and will be cached for later.")
+            try:
+                signin = inputimeout(prompt="Would you like to sign in? (y/n): ", timeout=30)
+            except TimeoutOccurred:
+                signin = "n"
+                return
+
+            if signin.lower() == "y":
+                yt = YouTube_Override(youtube_url, on_progress_callback=progress_callback, use_oauth=True)
+                stream = yt.streams.get_highest_resolution()
+            else:
+                return
+
+    progress_bar = tqdm(total=stream.filesize, unit="iB", unit_scale=True, desc=yt.title, disable=disable_progressbar)
+
+    retries = 0
+    while retries < 5:
+        try:
+            stream.download(output_path=savedir)
+            break
+        except Exception as e:
+            print("Download failed, retrying...")
+            os.remove(filename.replace("#", ""))
+            retries = retries + 1
+
+
+class YouTube_Override(YouTube):
+    """Fixes an issue with PyTube that would fail to bypass age restrictions"""
+
+    def bypass_age_gate(self):
+        """Attempt to update the vid_info by bypassing the age gate."""
+        innertube = InnerTube(client="ANDROID", use_oauth=self.use_oauth, allow_cache=self.allow_oauth_cache)
+        innertube_response = innertube.player(self.video_id)
+
+        playability_status = innertube_response["playabilityStatus"].get("status", None)
+
+        # If we still can't access the video, raise an exception
+        # (tier 3 age restriction)
+        if playability_status == "UNPLAYABLE":
+            raise pytube.exceptions.AgeRestrictedError(self.video_id)
+
+        self._vid_info = innertube_response
 
 
 def ts_downloader(m3u8_url, savedir, filename, disable_progressbar=False):
@@ -59,8 +107,7 @@ def ts_downloader(m3u8_url, savedir, filename, disable_progressbar=False):
     # Download the individual segments
     with ThreadPoolExecutor(max_workers=6) as executor:
         future_to_url = [
-            executor.submit(download_file, url + seg["uri"], TS_DIR)
-            for seg in m3u8_master.data["segments"]
+            executor.submit(download_file, url + seg["uri"], TS_DIR) for seg in m3u8_master.data["segments"]
         ]
 
         for future in tqdm(