Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Cherng <[email protected]>
  • Loading branch information
jfcherng committed Feb 18, 2024
1 parent 26a6fef commit 4fbad39
Show file tree
Hide file tree
Showing 18 changed files with 506 additions and 3 deletions.
59 changes: 59 additions & 0 deletions AutoSetSyntax.sublime-settings
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,65 @@
}
],

/////////////////////
// Magika settings //
/////////////////////

// To use this feature, you have to install the "magika" library.
// @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/dl-based-syntax-detection/#prerequisites
"magika.enabled": true,
"magika.min_confidence": 0.85,
// @see https://github.com/google/magika/blob/9e733e847ea0d93ea100d5d478a4b54c3ec5fd1c/docs/supported-content-types-list.md
"magika.syntax_map": {
"asm": [
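// There is no good way to pick one assembly dialect from Magika's generic "asm" label, so list the known assembly scopes as candidates.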
"scope:source.asm.x86_64",
"scope:source.asm.arm",
"scope:source.rvasm",
"scope:source.assembly"
],
"asp": ["scope:source.asp"],
"batch": ["scope:source.dosbatch"],
"c": [
// C++ is basically a superset and magika can't distinguish C and C++
"scope:source.c++"
],
"cs": ["scope:source.cs"],
"css": ["scope:source.scss", "scope:source.css"],
"csv": ["scope:text.advanced_csv", "scope:text.csv"],
"go": ["scope:source.go"],
"html": ["scope:text.html.basic"],
"ini": ["scope:source.ini"],
"java": ["scope:source.java"],
"javascript": ["scope:source.js"],
"json": ["scope:source.json"],
"latex": ["scope:text.tex.latex"],
"lisp": ["scope:source.lisp"],
"m3u": ["scope:text.m3u"],
"makefile": ["scope:source.makefile"],
"markdown": ["scope:text.html.markdown"],
"mum": ["=xml"],
"perl": ["scope:source.perl"],
"php": ["scope:embedding.php", "scope:text.html.php"],
"postscript": ["scope:source.postscript"],
"powershell": ["scope:source.powershell"],
"python": ["scope:source.python"],
"rdf": ["=xml"],
"rst": ["scope:text.restructuredtext"],
"rtf": ["scope:text.rtf"],
"ruby": ["scope:source.ruby"],
"rust": ["scope:source.rust"],
"scala": ["scope:source.scala"],
"shell": ["scope:source.shell.bash"],
"smali": ["scope:source.smali"],
"sql": ["scope:source.sql"],
"svg": ["=xml"],
"vba": ["scope:source.vbs"],
"winregistry": ["scope:source.reg"],
"xml": ["scope:text.xml"],
"yaml": ["scope:source.yaml"]
},

///////////////////////////////////////
// Guesslang settings (experimental) //
/////////////////////////////////////////////////////////////////////////
Expand Down
4 changes: 4 additions & 0 deletions menus/Default.sublime-commands
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
"caption": "AutoSetSyntax: Toggle Log Panel",
"command": "auto_set_syntax_toggle_log_panel",
},
{
"caption": "AutoSetSyntax: Download Dependencies",
"command": "auto_set_syntax_download_dependencies",
},
{
"caption": "AutoSetSyntax: Download Guesslang Server",
"command": "auto_set_syntax_download_guesslang_server",
Expand Down
11 changes: 10 additions & 1 deletion plugin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib
import importlib.machinery
import pkgutil
import sys
from pathlib import Path

import sublime
Expand All @@ -11,11 +12,12 @@
AutoSetSyntaxCreateNewConstraintCommand,
AutoSetSyntaxCreateNewMatchCommand,
AutoSetSyntaxDebugInformationCommand,
AutoSetSyntaxDownloadDependenciesCommand,
AutoSetSyntaxDownloadGuesslangServerCommand,
AutoSetSyntaxRestartGuesslangCommand,
run_auto_set_syntax_on_view,
)
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME, PLUGIN_PY_LIBS_DIR
from .listener import (
AutoSetSyntaxEventListener,
AutoSetSyntaxTextChangeListener,
Expand Down Expand Up @@ -48,6 +50,7 @@
"AutoSetSyntaxCreateNewConstraintCommand",
"AutoSetSyntaxCreateNewMatchCommand",
"AutoSetSyntaxDebugInformationCommand",
"AutoSetSyntaxDownloadDependenciesCommand",
"AutoSetSyntaxDownloadGuesslangServerCommand",
"AutoSetSyntaxRestartGuesslangCommand",
# ST: listeners
Expand All @@ -67,6 +70,7 @@ def plugin_loaded() -> None:


def _plugin_loaded() -> None:
_add_python_lib_path()
_load_custom_implementations()

AioSettings.plugin_name = PLUGIN_NAME
Expand Down Expand Up @@ -98,6 +102,11 @@ def _settings_changed_callback(window: sublime.Window) -> None:
compile_rules(window, is_update=True)


def _add_python_lib_path() -> None:
    """Make sure ``PLUGIN_PY_LIBS_DIR`` is importable by putting it at the front of ``sys.path``."""
    libs_dir = str(PLUGIN_PY_LIBS_DIR)
    # already registered (e.g. the plugin has been reloaded): keep sys.path untouched
    if libs_dir in sys.path:
        return
    sys.path.insert(0, libs_dir)


def _load_custom_implementations() -> None:
for finder, name, _ in pkgutil.iter_modules(map(str, PLUGIN_CUSTOM_MODULE_PATHS.values())):
assert isinstance(finder, importlib.machinery.FileFinder)
Expand Down
2 changes: 2 additions & 0 deletions plugin/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
AutoSetSyntaxCreateNewMatchCommand,
)
from .auto_set_syntax_debug_information import AutoSetSyntaxDebugInformationCommand
from .auto_set_syntax_download_dependencies import AutoSetSyntaxDownloadDependenciesCommand
from .auto_set_syntax_download_guesslang_server import AutoSetSyntaxDownloadGuesslangServerCommand
from .auto_set_syntax_restart_guesslang import AutoSetSyntaxRestartGuesslangCommand

Expand All @@ -13,6 +14,7 @@
"AutoSetSyntaxCreateNewConstraintCommand",
"AutoSetSyntaxCreateNewMatchCommand",
"AutoSetSyntaxDebugInformationCommand",
"AutoSetSyntaxDownloadDependenciesCommand",
"AutoSetSyntaxDownloadGuesslangServerCommand",
"AutoSetSyntaxRestartGuesslangCommand",
# ...
Expand Down
57 changes: 56 additions & 1 deletion plugin/commands/auto_set_syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from ..constants import PLUGIN_NAME, RE_ST_SYNTAX_TEST_LINE, RE_VIM_SYNTAX_LINE
from ..guesslang.types import GuesslangServerPredictionItem, GuesslangServerResponse
from ..helpers import is_syntaxable_view
from ..helpers import is_syntaxable_view, resolve_magika_label_with_syntax_map
from ..libs import websocket
from ..logger import Logger
from ..rules import SyntaxRuleCollection
Expand Down Expand Up @@ -194,6 +194,17 @@ def run_auto_set_syntax_on_view(
} and _assign_syntax_with_trimmed_filename(view, event):
return True

if event in {
ListenerEvent.COMMAND,
ListenerEvent.INIT,
ListenerEvent.LOAD,
ListenerEvent.MODIFY,
ListenerEvent.PASTE,
ListenerEvent.SAVE,
ListenerEvent.UNTRANSIENTIZE,
} and _assign_syntax_with_magika(view, event):
return True

if _assign_syntax_with_heuristics(view, event):
return True

Expand Down Expand Up @@ -374,6 +385,50 @@ def is_json(view: sublime.View) -> bool:
return False


def _assign_syntax_with_magika(view: sublime.View, event: ListenerEvent | None = None) -> bool:
    """
    Try to assign a syntax to the view from Magika's content-type prediction.

    :param view: The view
    :param event: The event which triggers this detection

    :returns: ``False`` when the detection is skipped or yields nothing usable,
              otherwise the result of ``assign_syntax_to_view``
    """
    if not (window := view.window()):
        return False

    settings = get_merged_plugin_settings(window=window)
    if not settings or not settings.get("magika.enabled"):
        return False

    if not (view_snapshot := G.view_snapshot_collection.get_by_view(view)):
        return False

    # don't apply to files that have an extension, unless explicitly requested by a command
    if event != ListenerEvent.COMMAND and "." in view_snapshot.file_name_unhidden:
        return False

    # only apply to views which are still in a plain text syntax
    if not ((syntax := view_snapshot.syntax) and is_plaintext_syntax(syntax)):
        return False

    # we don't want to use the AI model during typing when there is only one line
    # that may result in unwanted behavior such as a new buffer may be assigned to Python
    # right after "import" is typed but it could be JavaScript or TypeScript as well
    if event == ListenerEvent.MODIFY and "\n" not in view_snapshot.content:
        return False

    # "magika" is an optional dependency, so it is imported lazily
    try:
        from magika import Magika
    except ImportError as e:
        Logger.log(f"💣 Error occurred when importing Magika: {e}", window=window)
        return False

    classifier = Magika()
    output = classifier.identify_bytes(view_snapshot.content.encode()).output
    Logger.log(f"🐛 Magika's prediction: {output}", window=window)

    threshold: float = settings.get("magika.min_confidence", 0.0)
    # these labels carry no information about which syntax should be used
    if output.score < threshold or output.ct_label in {"directory", "empty", "txt", "unknown"}:
        return False

    syntax_map: dict[str, list[str]] = settings.get("magika.syntax_map", {})
    if not (syntax_likes := resolve_magika_label_with_syntax_map(output.ct_label, syntax_map)):
        Logger.log(f'🤔 Unknown "label" from Magika: {output.ct_label}', window=window)
        return False

    if not (syntax := find_syntax_by_syntax_likes(syntax_likes, include_plaintext=False)):
        Logger.log(f"😢 Failed finding syntax from Magika: {syntax_likes}", window=window)
        return False

    sublime.status_message(f"Predicted syntax: {output.ct_label} ({round(output.score * 100, 2)}% confidence)")
    return assign_syntax_to_view(view, syntax, details={"event": event, "reason": "Magika (Deep Learning)"})


def _assign_syntax_with_guesslang_async(view: sublime.View, event: ListenerEvent | None = None) -> None:
if not (
G.guesslang_client
Expand Down
124 changes: 124 additions & 0 deletions plugin/commands/auto_set_syntax_download_dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
from __future__ import annotations

import gzip
import tarfile
import threading
import urllib.request
import zipfile
from collections.abc import Iterable
from pathlib import Path
from typing import Union

import sublime
import sublime_plugin

from ..constants import PLUGIN_NAME, PLUGIN_PY_LIBS_DIR, PLUGIN_PY_LIBS_URL, PLUGIN_PY_LIBS_ZIP_NAME
from ..utils import rmtree_ex

PathLike = Union[Path, str]


class AutoSetSyntaxDownloadDependenciesCommand(sublime_plugin.ApplicationCommand):
    """Download the dependencies archive and extract it next to ``PLUGIN_PY_LIBS_DIR``."""

    # Dependencies are published on https://github.com/jfcherng-sublime/ST-AutoSetSyntax/tree/dependencies

    def description(self) -> str:
        return f"{PLUGIN_NAME}: Download Dependencies"

    def run(self) -> None:
        # downloading is slow, so it is done in a worker thread
        self.t = threading.Thread(target=self._worker)
        self.t.start()

    @classmethod
    def _worker(cls) -> None:
        """Run the download and report the outcome to the user with dialogs."""
        sublime.message_dialog(f"[{PLUGIN_NAME}] Start downloading dependencies...")

        # the download error has been reported already; don't claim success afterwards
        if not cls._prepare_dependencies():
            return

        if not (magika_dir := PLUGIN_PY_LIBS_DIR / "magika").is_dir():
            sublime.error_message(f"[{PLUGIN_NAME}] Cannot find magika: {str(magika_dir)}")
            return

        sublime.message_dialog(f"[{PLUGIN_NAME}] Finish downloading dependencies!")

    @staticmethod
    def _prepare_dependencies() -> bool:
        """
        Download the dependencies archive and extract it.

        :returns: Whether the archive has been downloaded. Extraction problems are
                  left for the caller to detect by checking the extracted files.
        """
        zip_path = PLUGIN_PY_LIBS_DIR.parent / PLUGIN_PY_LIBS_ZIP_NAME

        try:
            download_file(PLUGIN_PY_LIBS_URL, zip_path)
        except Exception as e:
            sublime.error_message(f"[{PLUGIN_NAME}] Error while downloading: {PLUGIN_PY_LIBS_URL} ({e})")
            # don't leave a partially written archive behind
            zip_path.unlink(missing_ok=True)
            return False

        try:
            # the old libraries are only removed once the new archive is available
            rmtree_ex(PLUGIN_PY_LIBS_DIR, ignore_errors=True)
            decompress_file(zip_path)
        finally:
            zip_path.unlink(missing_ok=True)
        return True


def decompress_file(tarball: PathLike, dst_dir: PathLike | None = None) -> bool:
    """
    Decompress the tarball.

    The archive format is chosen by the file name: ``.tar.gz``, ``.tar`` or ``.zip``.

    :param tarball: The tarball
    :param dst_dir: The destination directory (defaults to the directory of the tarball)

    :returns: Successfully decompressed the tarball or not
    """

    def is_inside(root: Path, candidate: Path) -> bool:
        # the root itself is fine: archives made with "tar -C dir ." contain a "." member
        return candidate == root or root in candidate.parents

    def tar_safe_extract(
        tar: tarfile.TarFile,
        path: PathLike = ".",
        members: Iterable[tarfile.TarInfo] | None = None,
        *,
        numeric_owner: bool = False,
    ) -> None:
        path = Path(path).resolve()
        for member in tar.getmembers():
            member_path = (path / member.name).resolve()
            if not is_inside(path, member_path):
                raise Exception("Attempted Path Traversal in Tar File")

            # a link pointing outside of the destination lets later members escape through it
            if member.issym():
                # symlink targets are relative to the directory containing the link
                link_target = ((path / member.name).parent / member.linkname).resolve()
            elif member.islnk():
                # hardlink targets are relative to the archive root
                link_target = (path / member.linkname).resolve()
            else:
                continue
            if not is_inside(path, link_target):
                raise Exception("Attempted Path Traversal in Tar File")

        extract_kwargs = {"numeric_owner": numeric_owner}
        # use the stdlib's own extraction filter as a second line of defense when it exists
        if hasattr(tarfile, "data_filter"):
            extract_kwargs["filter"] = "data"
        tar.extractall(path, members, **extract_kwargs)

    tarball = Path(tarball)
    dst_dir = Path(dst_dir) if dst_dir else tarball.parent
    filename = tarball.name

    try:
        if filename.endswith(".tar.gz"):
            with tarfile.open(tarball, "r:gz") as tar_gz:
                tar_safe_extract(tar_gz, dst_dir)
            return True

        if filename.endswith(".tar"):
            with tarfile.open(tarball, "r:") as tar_plain:
                tar_safe_extract(tar_plain, dst_dir)
            return True

        if filename.endswith(".zip"):
            with zipfile.ZipFile(tarball) as zip_archive:
                zip_archive.extractall(dst_dir)
            return True
    except Exception:
        # deliberate best effort: any failure is reported through the return value
        return False

    # unsupported file extension
    return False


def download_file(url: str, save_path: PathLike) -> None:
    """
    Fetch a URL and store the response body on disk.

    :param url: The url
    :param save_path: The path of the saved file
    """
    target = Path(save_path)

    # drop any previous copy and make sure the destination directory exists before fetching
    target.unlink(missing_ok=True)
    target.parent.mkdir(parents=True, exist_ok=True)

    content = simple_urlopen(url)
    target.write_bytes(content)


def simple_urlopen(url: str, chunk_size: int = 512 * 1024) -> bytes:
    """
    Fetch the whole response body of a URL.

    :param url: The url
    :param chunk_size: The number of bytes requested per read

    :returns: The response body, decompressed if the response declares
              ``Content-Encoding: gzip``
    """
    # the context manager closes the response even if reading fails
    with urllib.request.urlopen(url) as response:
        chunks = []
        while chunk := response.read(chunk_size):
            chunks.append(chunk)
        is_gzipped = response.info().get("Content-Encoding") == "gzip"

    # join once instead of re-copying a growing bytes object for every chunk
    data = b"".join(chunks)
    return gzip.decompress(data) if is_gzipped else data
8 changes: 8 additions & 0 deletions plugin/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@

################################################################################

# directory name of the Python libraries, suffixed with ST_PLATFORM_ARCH
PLUGIN_PY_LIBS_DIR_NAME = "libs-py38@{arch}".format(arch=ST_PLATFORM_ARCH)
# where the Python libraries live inside the plugin storage directory
PLUGIN_PY_LIBS_DIR = PLUGIN_STORAGE_DIR / PLUGIN_PY_LIBS_DIR_NAME
# file name of the archive which holds the Python libraries
PLUGIN_PY_LIBS_ZIP_NAME = PLUGIN_PY_LIBS_DIR_NAME + ".zip"
# the archive is served from the "dependencies" branch of the plugin repository
PLUGIN_PY_LIBS_URL = (
    f"https://github.com/jfcherng-sublime/ST-AutoSetSyntax/raw/dependencies/{PLUGIN_PY_LIBS_ZIP_NAME}"
)

GUESSLANG_SERVER_TAG = "server-0.1.7"
GUESSLANG_SERVER_URL = "https://github.com/{repo}/archive/{ref}.zip".format(
repo="jfcherng-sublime/ST-AutoSetSyntax",
Expand Down
17 changes: 16 additions & 1 deletion plugin/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sublime

from .settings import get_st_setting
from .utils import is_plaintext_syntax, is_transient_view
from .utils import is_plaintext_syntax, is_transient_view, stable_unique


def is_syntaxable_view(view: sublime.View, must_plaintext: bool = False) -> bool:
Expand All @@ -15,3 +15,18 @@ def is_syntaxable_view(view: sublime.View, must_plaintext: bool = False) -> bool
and (not must_plaintext or ((syntax := view.syntax()) and is_plaintext_syntax(syntax)))
and ((size_max := get_st_setting("syntax_detection_size_limit", 0)) == 0 or size_max >= view.size())
)


def resolve_magika_label_with_syntax_map(label: str, syntax_map: dict[str, list[str]]) -> list[str]:
    """
    Resolve a Magika label into a list of syntax-likes.

    An entry of the form ``"=other"`` is a reference, which is replaced in place by the
    resolved entries of the label ``other``. Circular references are expanded only once,
    so resolution always terminates.

    :param label: The label predicted by Magika
    :param syntax_map: The mapping from a label to its syntax-likes and/or references

    :returns: The unique syntax-likes in their order of first appearance
    """
    # a dict is used as an insertion-ordered set
    resolved: dict[str, None] = {}
    # labels which have been expanded already; this is what breaks reference cycles
    expanded: set[str] = {label}
    # items are popped from the end, so they are pushed reversed to keep the listed order
    pending: list[str] = list(reversed(syntax_map.get(label, [])))

    while pending:
        syntax_like = pending.pop()
        if syntax_like.startswith("="):
            referred = syntax_like[1:]
            # everything from an expanded label is already (being) collected
            if referred not in expanded:
                expanded.add(referred)
                pending.extend(reversed(syntax_map.get(referred, [])))
            continue
        resolved.setdefault(syntax_like)

    return list(resolved)
Loading

0 comments on commit 4fbad39

Please sign in to comment.