Skip to content

Commit

Permalink
Merge branch 'stealth_v2'
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafiot committed Dec 26, 2024
2 parents 8dff3af + 9fd88d8 commit 889089c
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 54 deletions.
1 change: 1 addition & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ strict = True
warn_return_any = False
show_error_context = True
pretty = True
follow_untyped_imports = True

[mypy-docs.source.*]
ignore_errors = True
62 changes: 17 additions & 45 deletions playwrightcapture/capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@
import time

from base64 import b64decode
from dataclasses import dataclass
from io import BytesIO
from logging import LoggerAdapter, Logger
from tempfile import NamedTemporaryFile
from typing import Any, TypedDict, Literal, TYPE_CHECKING
from collections.abc import MutableMapping, Iterator
from collections.abc import MutableMapping
from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
from zipfile import ZipFile

Expand All @@ -31,7 +30,7 @@
from playwright._impl._errors import TargetClosedError
from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
from playwright_stealth import Stealth # type: ignore[attr-defined]
from puremagic import PureError, from_string
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
Expand All @@ -54,8 +53,8 @@
BROWSER = Literal['chromium', 'firefox', 'webkit']

try:
import pydub # type: ignore[import-untyped]
from speech_recognition import Recognizer, AudioFile # type: ignore[import-untyped]
import pydub
from speech_recognition import Recognizer, AudioFile
CAN_SOLVE_CAPTCHA = True
except ImportError:
CAN_SOLVE_CAPTCHA = False
Expand Down Expand Up @@ -94,39 +93,6 @@ def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, Muta
# https://bot.incolumitas.com/
# https://fingerprintjs.github.io/BotD/main/

@dataclass
class PCStealthConfig(StealthConfig):  # type: ignore[misc]
    """Stealth configuration whose flags are pinned each time the scripts are listed."""

    @property
    def enabled_scripts(self) -> Iterator[str]:
        """Apply the pinned flag values to this instance, then yield the parent's scripts.

        The values are (re)assigned on every access, so anything set on the
        instance beforehand for these attributes is overwritten.
        """
        pinned_values = {
            'chrome_app': True,
            'chrome_csi': True,
            'chrome_runtime': True,
            'chrome_load_times': True,
            'navigator_plugins': True,
            'hairline': True,
            'iframe_content_window': True,
            'media_codecs': True,
            # permissions are handled directly in playwright
            'navigator_permissions': False,
            # Platform is correct now
            'navigator_platform': False,
            # probably useless, but it will fallback to 4 regardless
            'navigator_hardware_concurrency': 4,
            # Webgl vendor is correct now
            'webgl_vendor': False,
            # Set by the viewport
            'outerdimensions': False,
            # Not working with Playwright 1.45+, each of these causes issues
            'navigator_languages': False,
            'navigator_user_agent': False,
            'navigator_vendor': False,
        }
        # Same assignments, in the same order, as setting each attribute by hand.
        for attribute, value in pinned_values.items():
            setattr(self, attribute, value)

        yield from super().enabled_scripts


class Capture():

_browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
Expand Down Expand Up @@ -460,6 +426,14 @@ async def initialize_context(self) -> None:
# record_video_dir='./videos/',
**device_context_settings
)

stealth = Stealth(
navigator_languages_override=(self.locale, self.locale.split('-')[0]) if self.locale else ("en-US", "en"),
navigator_user_agent_override=ua,
init_scripts_only=True
)
await stealth.apply_stealth_async(self.context)

self.context.set_default_timeout(self._capture_timeout * 1000)

if self.cookies:
Expand Down Expand Up @@ -849,8 +823,6 @@ async def store_request(request: Request) -> None:
await self.__dialog_clickthrough(page)
await self.__dialog_tarteaucitron_clickthrough(page)

await stealth_async(page, PCStealthConfig())

page.set_default_timeout((self._capture_timeout - 2) * 1000)
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
page.on("requestfinished", store_request)
Expand Down Expand Up @@ -1356,12 +1328,12 @@ async def _recaptcha_solver(self, page: Page) -> bool:
mp3_content = await response.read()
with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
mp3_file.write(mp3_content)
pydub.AudioSegment.from_mp3(mp3_file.name).export(wav_file.name, format="wav")
recognizer = Recognizer()
recaptcha_audio = AudioFile(wav_file.name)
pydub.AudioSegment.from_mp3(mp3_file.name).export(wav_file.name, format="wav") # type: ignore[attr-defined,no-untyped-call]
recognizer = Recognizer() # type: ignore[no-untyped-call]
recaptcha_audio = AudioFile(wav_file.name) # type: ignore[no-untyped-call]
with recaptcha_audio as source:
audio = recognizer.record(source)
text = recognizer.recognize_google(audio)
audio = recognizer.record(source) # type: ignore[no-untyped-call]
text = recognizer.recognize_google(audio) # type: ignore[attr-defined]
await main_frame.get_by_role("textbox", name="Enter what you hear").fill(text)
await main_frame.get_by_role("button", name="Verify").click()
await self._safe_wait(page, 5)
Expand Down
21 changes: 13 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ w3lib = "^2.2.1"
pydub = {version = "^0.25.1", optional = true}
SpeechRecognition = {version = ">=3.12.0", optional = true}
tzdata = "^2024.2"
playwright-stealth = "^1.0.6"
setuptools = "^75.6.0"
puremagic = "^1.28"
async-timeout = {version = "^4.0.3", python = "<3.11"}
aiohttp = {version = "^3.11.11", extras = ["speedups"]}
aiohttp-socks = "^0.10"
playwright-stealth = {git = "https://github.com/Mattwmaster58/playwright_stealth"}

[tool.poetry.extras]
recaptcha = ["pydub", "SpeechRecognition"]
Expand Down

0 comments on commit 889089c

Please sign in to comment.