Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Cherng <[email protected]>
  • Loading branch information
jfcherng committed Feb 18, 2024
1 parent 26a6fef commit f20293b
Show file tree
Hide file tree
Showing 16 changed files with 479 additions and 8 deletions.
19 changes: 14 additions & 5 deletions AutoSetSyntax.sublime-settings
Original file line number Diff line number Diff line change
Expand Up @@ -643,11 +643,20 @@
}
],

///////////////////////////////////////
// Guesslang settings (experimental) //
/////////////////////////////////////////////////////////////////////////
// You have to restart ST after modifying any of guesslang's settings. //
/////////////////////////////////////////////////////////////////////////
/////////////////////
// Magika settings //
/////////////////////

// To use this feature, you have to install the "magika" library.
// @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/dl-based-syntax-detection/#prerequisites
"magika.enabled": true,
"magika.min_confidence": 0.85,
// @see https://github.com/google/magika/blob/9e733e847ea0d93ea100d5d478a4b54c3ec5fd1c/docs/supported-content-types-list.md
"magika.syntax_map": {
"rs": ["scope:source.rust"],
// ...
"rust": ["=rs"],
},

// To use this feature, you have to install the server.
// @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/experimental/ml-based-syntax-detection/#prerequisites
Expand Down
9 changes: 8 additions & 1 deletion plugin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib
import importlib.machinery
import pkgutil
import sys
from pathlib import Path

import sublime
Expand All @@ -15,7 +16,7 @@
AutoSetSyntaxRestartGuesslangCommand,
run_auto_set_syntax_on_view,
)
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME, PLUGIN_PY_LIBS_DIR
from .listener import (
AutoSetSyntaxEventListener,
AutoSetSyntaxTextChangeListener,
Expand Down Expand Up @@ -67,6 +68,7 @@ def plugin_loaded() -> None:


def _plugin_loaded() -> None:
_add_python_lib_path()
_load_custom_implementations()

AioSettings.plugin_name = PLUGIN_NAME
Expand Down Expand Up @@ -98,6 +100,11 @@ def _settings_changed_callback(window: sublime.Window) -> None:
compile_rules(window, is_update=True)


def _add_python_lib_path() -> None:
    """Put `PLUGIN_PY_LIBS_DIR` at the front of `sys.path` unless it is already listed."""
    libs_dir = str(PLUGIN_PY_LIBS_DIR)
    if libs_dir in sys.path:
        return
    # index 0 so that modules in this directory are found before other entries
    sys.path.insert(0, libs_dir)


def _load_custom_implementations() -> None:
for finder, name, _ in pkgutil.iter_modules(map(str, PLUGIN_CUSTOM_MODULE_PATHS.values())):
assert isinstance(finder, importlib.machinery.FileFinder)
Expand Down
57 changes: 56 additions & 1 deletion plugin/commands/auto_set_syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from ..constants import PLUGIN_NAME, RE_ST_SYNTAX_TEST_LINE, RE_VIM_SYNTAX_LINE
from ..guesslang.types import GuesslangServerPredictionItem, GuesslangServerResponse
from ..helpers import is_syntaxable_view
from ..helpers import is_syntaxable_view, resolve_magika_label_with_syntax_map
from ..libs import websocket
from ..logger import Logger
from ..rules import SyntaxRuleCollection
Expand Down Expand Up @@ -194,6 +194,17 @@ def run_auto_set_syntax_on_view(
} and _assign_syntax_with_trimmed_filename(view, event):
return True

if event in {
ListenerEvent.COMMAND,
ListenerEvent.INIT,
ListenerEvent.LOAD,
ListenerEvent.MODIFY,
ListenerEvent.PASTE,
ListenerEvent.SAVE,
ListenerEvent.UNTRANSIENTIZE,
} and _assign_syntax_with_magika(view, event):
return True

if _assign_syntax_with_heuristics(view, event):
return True

Expand Down Expand Up @@ -374,6 +385,50 @@ def is_json(view: sublime.View) -> bool:
return False


# Shared Magika classifier; created on first use by `_get_magika_classifier()`.
_magika_classifier = None


def _get_magika_classifier():
    """
    Return the shared Magika classifier, creating it on first use.

    The classifier used to be re-created for every event (including every modification);
    it is now built once and reused.
    NOTE(review): assumes a `Magika` instance can be reused across calls/threads -- confirm with Magika's docs.

    :raises ImportError: If the "magika" library cannot be imported
    """
    global _magika_classifier
    if _magika_classifier is None:
        # imported lazily because the "magika" library has to be installed by the user
        from magika import Magika

        _magika_classifier = Magika()
    return _magika_classifier


def _assign_syntax_with_magika(view: sublime.View, event: ListenerEvent | None = None) -> bool:
    """
    Try to assign a syntax to the view based on Magika's content type prediction.

    :param view: The view
    :param event: The event which triggers this detection
    :returns: The result of `assign_syntax_to_view` if a syntax is found for the prediction, otherwise `False`
    """
    if not (
        (window := view.window())
        and (settings := get_merged_plugin_settings(window=window))
        and settings.get("magika.enabled")
        and (view_snapshot := G.view_snapshot_collection.get_by_view(view))
        # skip files whose name has an extension, unless triggered by the command
        and (event == ListenerEvent.COMMAND or "." not in view_snapshot.file_name_unhidden)
        # only apply on plain text syntax
        and ((syntax := view_snapshot.syntax) and is_plaintext_syntax(syntax))
        # we don't want to use AI model during typing when there is only one line
        # that may result in unwanted behavior such as a new buffer may be assigned to Python
        # right after "import" is typed but it could be JavaScript or TypeScript as well
        and (event != ListenerEvent.MODIFY or "\n" in view_snapshot.content)
    ):
        return False

    try:
        classifier = _get_magika_classifier()
    except ImportError as e:
        Logger.log(f"💣 Error occured when importing Magika: {e}", window=window)
        return False

    output = classifier.identify_bytes(view_snapshot.content.encode()).output
    Logger.log(f"🐛 Magika's prediction: {output}", window=window)

    # reject low-confidence predictions and labels which carry no syntax information
    threshold: float = settings.get("magika.min_confidence", 0.0)
    if output.score < threshold or output.ct_label in {"empty", "txt", "unknown"}:
        return False

    syntax_map: dict[str, list[str]] = settings.get("magika.syntax_map", {})
    if not (syntax_likes := resolve_magika_label_with_syntax_map(output.ct_label, syntax_map)):
        Logger.log(f'🤔 Unknown "label" from Magika: {output.ct_label}', window=window)
        return False

    if not (syntax := find_syntax_by_syntax_likes(syntax_likes, include_plaintext=False)):
        Logger.log(f"😢 Failed finding syntax from Magika: {syntax_likes}", window=window)
        return False

    sublime.status_message(f"Predicted syntax: {syntax.name} ({round(output.score * 100, 2)}% confidence)")
    return assign_syntax_to_view(view, syntax, details={"event": event, "reason": "Magika (Deep Learning)"})


def _assign_syntax_with_guesslang_async(view: sublime.View, event: ListenerEvent | None = None) -> None:
if not (
G.guesslang_client
Expand Down
157 changes: 157 additions & 0 deletions plugin/commands/auto_set_syntax_download_dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
from __future__ import annotations

import gzip
import tarfile
import threading
import time
import urllib.request
import zipfile
from collections.abc import Iterable
from pathlib import Path
from typing import Union

import sublime
import sublime_plugin

from ..constants import GUESSLANG_SERVER_URL, PLUGIN_NAME
from ..guesslang.server import GuesslangServer
from ..settings import get_merged_plugin_setting
from ..shared import G
from ..utils import first_true, rmtree_ex

PathLike = Union[Path, str]


class AutoSetSyntaxDownloadDependenciesCommand(sublime_plugin.ApplicationCommand):
    """Application command which (re-)downloads the guesslang server in a background thread."""

    # Server codes are published on https://github.com/jfcherng-sublime/ST-AutoSetSyntax/tree/dependencies

    def description(self) -> str:
        """Return the description of this command."""
        return f"{PLUGIN_NAME}: Download Dependencies"

    def run(self) -> None:
        """Run the download in a separate thread so that this call returns immediately."""
        self.t = threading.Thread(target=self._worker)
        self.t.start()

    @classmethod
    def _worker(cls) -> None:
        """
        Stop the running server (if any), replace the server directory with a fresh download
        and restart the server if guesslang is enabled.

        Any exception raised on the way is reported to the user with an error dialog.
        """
        sublime.status_message("Begin downloading guesslang server...")

        if (server := G.guesslang_server) and server.is_running():
            server.stop()
            time.sleep(1)  # wait for stopping the server

        rmtree_ex(GuesslangServer.SERVER_DIR, ignore_errors=True)

        try:
            cls._prepare_bin()

            if not GuesslangServer.SERVER_FILE.is_file():
                sublime.error_message(f"[{PLUGIN_NAME}] Cannot find the server: {str(GuesslangServer.SERVER_FILE)}")
                # nothing usable has been installed: neither restart the server nor report a success
                return

            if get_merged_plugin_setting("guesslang.enabled"):
                sublime.run_command("auto_set_syntax_restart_guesslang")

            sublime.message_dialog(f"[{PLUGIN_NAME}] Finish downloading guesslang server!")
        except Exception as e:
            sublime.error_message(f"[{PLUGIN_NAME}] {e}")

    @staticmethod
    def _prepare_bin() -> None:
        """
        Download the server archive and make its top-level folder the new server directory.

        :raises RuntimeError: If the downloaded archive cannot be decompressed
        """
        zip_path = GuesslangServer.SERVER_DIR / "source.zip"
        download_file(GUESSLANG_SERVER_URL, zip_path)
        # `decompress_file` reports a failure via its return value rather than an exception
        if not decompress_file(zip_path):
            raise RuntimeError(f"Failed to decompress: {zip_path}")

        # get the folder, which is just decompressed (the most recently modified one)
        # NOTE(review): `first_true` presumably gives the first truthy item, or a falsy value if there is none
        folder = first_true(
            sorted(
                filter(Path.is_dir, zip_path.parent.iterdir()),
                key=lambda p: p.stat().st_mtime,
                reverse=True,
            ),
        )

        if not folder:
            return

        # move the decompressed folder one level up
        guesslang_server_dir = folder.parent
        tmp_dir = guesslang_server_dir.parent / ".tmp"
        rmtree_ex(tmp_dir, ignore_errors=True)
        folder.replace(tmp_dir)
        rmtree_ex(guesslang_server_dir, ignore_errors=True)
        tmp_dir.replace(guesslang_server_dir)
        # cleanup
        zip_path.unlink(missing_ok=True)


def decompress_file(tarball: PathLike, dst_dir: PathLike | None = None) -> bool:
    """
    Decompress the tarball.

    The archive type is decided by the file name: ".tar.gz", ".tar" and ".zip" are handled.

    :param tarball: The tarball
    :param dst_dir: The destination directory. Defaults to the directory of the tarball.
    :returns: Successfully decompressed the tarball or not
    """

    def tar_safe_extract(
        tar: tarfile.TarFile,
        path: PathLike = ".",
        members: Iterable[tarfile.TarInfo] | None = None,
        *,
        numeric_owner: bool = False,
    ) -> None:
        """Extract like `TarFile.extractall` but refuse archives having a member resolved outside `path`."""
        path = Path(path).resolve()
        for member in tar.getmembers():
            member_path = (path / member.name).resolve()
            # a member like "." resolves to `path` itself, which is not a traversal;
            # archives created with `tar -C dir .` contain such a member
            if member_path != path and path not in member_path.parents:
                raise Exception("Attempted Path Traversal in Tar File")

        tar.extractall(path, members, numeric_owner=numeric_owner)

    tarball = Path(tarball)
    dst_dir = Path(dst_dir) if dst_dir else tarball.parent
    filename = tarball.name

    try:
        if filename.endswith(".tar.gz"):
            with tarfile.open(tarball, "r:gz") as f_1:
                tar_safe_extract(f_1, dst_dir)
            return True

        if filename.endswith(".tar"):
            with tarfile.open(tarball, "r:") as f_2:
                tar_safe_extract(f_2, dst_dir)
            return True

        if filename.endswith(".zip"):
            with zipfile.ZipFile(tarball) as f_3:
                f_3.extractall(dst_dir)
            return True
    except Exception:
        # deliberate best-effort: any failure (including a rejected archive) is reported as `False`
        return False

    # unsupported file name suffix
    return False


def download_file(url: str, save_path: PathLike) -> None:
    """
    Fetch the content of a URL and store it as a file.

    An existing file at the target path is removed first
    and missing parent directories are created.

    :param url: The url
    :param save_path: The path of the saved file
    """

    target = Path(save_path)
    target.unlink(missing_ok=True)
    target.parent.mkdir(parents=True, exist_ok=True)
    # the download starts only after the target location has been prepared
    payload = simple_urlopen(url)
    target.write_bytes(payload)


def simple_urlopen(url: str, chunk_size: int = 512 * 1024) -> bytes:
    """
    Read the whole response body of a URL.

    :param url: The url
    :param chunk_size: The maximum number of bytes requested per read
    :returns: The body, gunzipped if the "Content-Encoding" response header is "gzip"
    """
    # the context manager closes the response even if reading fails
    with urllib.request.urlopen(url) as response:
        # collect the chunks and join once to avoid repeated bytes concatenation
        chunks = []
        while chunk := response.read(chunk_size):
            chunks.append(chunk)
        is_gzipped = response.info().get("Content-Encoding") == "gzip"

    data = b"".join(chunks)
    return gzip.decompress(data) if is_gzipped else data
1 change: 1 addition & 0 deletions plugin/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
PLUGIN_NAME = __package__.partition(".")[0] # like "AutoSetSyntax"

PLUGIN_STORAGE_DIR = Path(sublime.cache_path()).parent / f"Package Storage/{PLUGIN_NAME}"
PLUGIN_PY_LIBS_DIR = PLUGIN_STORAGE_DIR / f"libs-py38@{ST_PLATFORM_ARCH}"
PLUGIN_CUSTOM_DIR = Path(sublime.packages_path()) / f"{PLUGIN_NAME}-Custom"
PLUGIN_CUSTOM_MODULE_PATHS = {
"constraint": PLUGIN_CUSTOM_DIR / "constraints",
Expand Down
17 changes: 16 additions & 1 deletion plugin/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sublime

from .settings import get_st_setting
from .utils import is_plaintext_syntax, is_transient_view
from .utils import is_plaintext_syntax, is_transient_view, stable_unique


def is_syntaxable_view(view: sublime.View, must_plaintext: bool = False) -> bool:
Expand All @@ -15,3 +15,18 @@ def is_syntaxable_view(view: sublime.View, must_plaintext: bool = False) -> bool
and (not must_plaintext or ((syntax := view.syntax()) and is_plaintext_syntax(syntax)))
and ((size_max := get_st_setting("syntax_detection_size_limit", 0)) == 0 or size_max >= view.size())
)


def resolve_magika_label_with_syntax_map(label: str, syntax_map: dict[str, list[str]]) -> list[str]:
    """
    Resolve a Magika label into syntax-likes by using the syntax map.

    A map entry starting with "=" refers to another label, whose entries are expanded in place.
    A reference to a label which is currently being expanded (a circular reference) is skipped,
    so that the resolution always terminates.

    :param label: The label predicted by Magika
    :param syntax_map: The mapping from labels to syntax-likes or "=label" references
    :returns: The resolved syntax-likes in definition order, with duplicates removed
    """
    res: list[str] = []

    def expand(key: str, path: tuple[str, ...]) -> None:
        """Append the syntax-likes of `key`; `path` holds the labels currently being expanded."""
        for syntax_like in syntax_map.get(key, []):
            if not syntax_like.startswith("="):
                res.append(syntax_like)
            elif (ref := syntax_like[1:]) not in path:
                expand(ref, (*path, ref))

    expand(label, (label,))

    # order-preserving de-duplication which keeps the first occurrence
    return list(dict.fromkeys(res))
15 changes: 15 additions & 0 deletions tests/files/this-is-rust
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// This is a comment, and is ignored by the compiler.
// You can test this code by clicking the "Run" button over there ->
// or if you prefer to use your keyboard, you can use the "Ctrl + Enter"
// shortcut.

// This code is editable, feel free to hack it!
// You can always return to the original code by clicking the "Reset" button ->

// This is the main function.
fn main() {
// Statements here are executed when the compiled binary is called.

// Print text to the console.
println!("Hello World!");
}
5 changes: 5 additions & 0 deletions typings/magika/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from magika import magika as magika, prediction_mode as prediction_mode

Magika = magika.Magika
MagikaError = magika.MagikaError
PredictionMode = prediction_mode.PredictionMode
19 changes: 19 additions & 0 deletions typings/magika/cli/magika.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from _typeshed import Incomplete
from magika import Magika as Magika, MagikaError as MagikaError, PredictionMode as PredictionMode, colors as colors
from magika.content_types import ContentTypesManager as ContentTypesManager
from magika.logger import get_logger as get_logger
from magika.types import FeedbackReport as FeedbackReport, MagikaResult as MagikaResult
from pathlib import Path
from typing import List, Optional

VERSION: str
CONTACT_EMAIL: str
CONTEXT_SETTINGS: Incomplete
HELP_EPILOG: Incomplete

# Signature-only declarations (bodies are `...`) for the functions of the Magika CLI module.

# Entry point of the CLI; the parameters are the parsed command line options.
def main(file: List[Path], recursive: bool, json_output: bool, jsonl_output: bool, mime_output: bool, label_output: bool, magic_compatibility_mode: bool, output_score: bool, prediction_mode_str: str, batch_size: int, no_dereference: bool, with_colors: bool, verbose: bool, debug: bool, generate_report_flag: bool, output_version: bool, list_output_content_types: bool, model_dir: Optional[Path]) -> None: ...
# Helpers for reading the input from stdin.
def should_read_from_stdin(files_paths: List[Path]) -> bool: ...
def get_magika_result_from_stdin(magika: Magika) -> MagikaResult: ...
# Helpers for creating and printing feedback reports.
def generate_feedback_report(magika: Magika, file_path: Path, magika_result: MagikaResult) -> FeedbackReport: ...
def print_feedback_report(magika: Magika, reports: List[FeedbackReport]) -> None: ...
# NOTE(review): presumably prints the list of content types which Magika can output -- confirm with the implementation.
def print_output_content_types_list() -> None: ...
17 changes: 17 additions & 0 deletions typings/magika/colors.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
BLACK: str
RED: str
GREEN: str
YELLOW: str
BLUE: str
PURPLE: str
CYAN: str
LIGHT_GRAY: str
DARK_GRAY: str
LIGHT_RED: str
LIGHT_GREEN: str
LIGHT_YELLOW: str
LIGHT_BLUE: str
LIGHT_PURPLE: str
LIGHT_CYAN: str
WHITE: str
RESET: str
Loading

0 comments on commit f20293b

Please sign in to comment.