Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Cherng <[email protected]>
  • Loading branch information
jfcherng committed Feb 18, 2024
1 parent 26a6fef commit 9cff43d
Show file tree
Hide file tree
Showing 15 changed files with 321 additions and 8 deletions.
19 changes: 14 additions & 5 deletions AutoSetSyntax.sublime-settings
Original file line number Diff line number Diff line change
Expand Up @@ -643,11 +643,20 @@
}
],

///////////////////////////////////////
// Guesslang settings (experimental) //
/////////////////////////////////////////////////////////////////////////
// You have to restart ST after modifying any of guesslang's settings. //
/////////////////////////////////////////////////////////////////////////
/////////////////////
// Magika settings //
/////////////////////

// To use this feature, you have to install the "magika" library.
// @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/dl-based-syntax-detection/#prerequisites
"magika.enabled": true,
"magika.min_confidence": 0.85,
// @see https://github.com/google/magika/blob/9e733e847ea0d93ea100d5d478a4b54c3ec5fd1c/docs/supported-content-types-list.md
"magika.syntax_map": {
"rs": ["scope:source.rust"],
// ...
"rust": ["=rs"],
},

// To use this feature, you have to install the server.
// @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/experimental/ml-based-syntax-detection/#prerequisites
Expand Down
9 changes: 8 additions & 1 deletion plugin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib
import importlib.machinery
import pkgutil
import sys
from pathlib import Path

import sublime
Expand All @@ -15,7 +16,7 @@
AutoSetSyntaxRestartGuesslangCommand,
run_auto_set_syntax_on_view,
)
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME, PLUGIN_PY_LIBS_DIR
from .listener import (
AutoSetSyntaxEventListener,
AutoSetSyntaxTextChangeListener,
Expand Down Expand Up @@ -67,6 +68,7 @@ def plugin_loaded() -> None:


def _plugin_loaded() -> None:
_add_python_lib_path()
_load_custom_implementations()

AioSettings.plugin_name = PLUGIN_NAME
Expand Down Expand Up @@ -98,6 +100,11 @@ def _settings_changed_callback(window: sublime.Window) -> None:
compile_rules(window, is_update=True)


def _add_python_lib_path() -> None:
    """Make sure the plugin's Python libs directory is the first entry of ``sys.path``.

    Nothing happens when the directory is already listed in ``sys.path``.
    """
    libs_dir = str(PLUGIN_PY_LIBS_DIR)
    if libs_dir in sys.path:
        return
    sys.path.insert(0, libs_dir)


def _load_custom_implementations() -> None:
for finder, name, _ in pkgutil.iter_modules(map(str, PLUGIN_CUSTOM_MODULE_PATHS.values())):
assert isinstance(finder, importlib.machinery.FileFinder)
Expand Down
57 changes: 56 additions & 1 deletion plugin/commands/auto_set_syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from ..constants import PLUGIN_NAME, RE_ST_SYNTAX_TEST_LINE, RE_VIM_SYNTAX_LINE
from ..guesslang.types import GuesslangServerPredictionItem, GuesslangServerResponse
from ..helpers import is_syntaxable_view
from ..helpers import is_syntaxable_view, resolve_magika_label_with_syntax_map
from ..libs import websocket
from ..logger import Logger
from ..rules import SyntaxRuleCollection
Expand Down Expand Up @@ -194,6 +194,17 @@ def run_auto_set_syntax_on_view(
} and _assign_syntax_with_trimmed_filename(view, event):
return True

if event in {
ListenerEvent.COMMAND,
ListenerEvent.INIT,
ListenerEvent.LOAD,
ListenerEvent.MODIFY,
ListenerEvent.PASTE,
ListenerEvent.SAVE,
ListenerEvent.UNTRANSIENTIZE,
} and _assign_syntax_with_magika(view, event):
return True

if _assign_syntax_with_heuristics(view, event):
return True

Expand Down Expand Up @@ -374,6 +385,50 @@ def is_json(view: sublime.View) -> bool:
return False


# Lazily created, module-wide Magika classifier (``None`` until first needed).
_magika_classifier = None


def _get_magika_classifier():
    """Return the shared Magika classifier, creating it on first use.

    The instance is kept for the lifetime of this module so that detection
    triggered by frequent events (typing, pasting, ...) does not construct a
    new classifier on every call. At worst, a race on first use creates one
    extra instance that is then discarded.

    Raises:
        ImportError: If the "magika" library cannot be imported.
    """
    global _magika_classifier
    if _magika_classifier is None:
        # imported lazily because "magika" is an optional dependency
        from magika import Magika

        _magika_classifier = Magika()
    return _magika_classifier


def _assign_syntax_with_magika(view: sublime.View, event: ListenerEvent | None = None) -> bool:
    """Try to assign a syntax to ``view`` based on Magika's content-type prediction.

    Args:
        view: The view whose snapshot content is classified.
        event: The event that triggered the detection, if any.

    Returns:
        ``True`` if a syntax has been assigned to the view, ``False`` if the
        preconditions are not met, Magika is unavailable, the prediction is
        not confident/usable enough, or no matching syntax can be found.
    """
    if not (
        (window := view.window())
        and (settings := get_merged_plugin_settings(window=window))
        and settings.get("magika.enabled")
        and (view_snapshot := G.view_snapshot_collection.get_by_view(view))
        # don't apply to files that have an extension (unless triggered by the COMMAND event)
        and (event == ListenerEvent.COMMAND or "." not in view_snapshot.file_name_unhidden)
        # only apply on plain text syntax
        and ((syntax := view_snapshot.syntax) and is_plaintext_syntax(syntax))
        # we don't want to use AI model during typing when there is only one line
        # that may result in unwanted behavior such as a new buffer may be assigned to Python
        # right after "import" is typed but it could be JavaScript or TypeScript as well
        and (event != ListenerEvent.MODIFY or "\n" in view_snapshot.content)
    ):
        return False

    try:
        classifier = _get_magika_classifier()
    except ImportError as e:
        Logger.log(f"💣 Error occured when importing Magika: {e}", window=window)
        return False

    output = classifier.identify_bytes(view_snapshot.content.encode()).output
    Logger.log(f"🐛 Magika's prediction: {output}", window=window)

    threshold: float = settings.get("magika.min_confidence", 0.0)
    # these labels carry no useful syntax information, so treat them as "no prediction"
    if output.score < threshold or output.ct_label in {"empty", "txt", "unknown"}:
        return False

    syntax_map: dict[str, list[str]] = settings.get("magika.syntax_map", {})
    if not (syntax_likes := resolve_magika_label_with_syntax_map(output.ct_label, syntax_map)):
        Logger.log(f'🤔 Unknown "label" from Magika: {output.ct_label}', window=window)
        return False

    if not (syntax := find_syntax_by_syntax_likes(syntax_likes, include_plaintext=False)):
        Logger.log(f"😢 Failed finding syntax from Magika: {syntax_likes}", window=window)
        return False

    sublime.status_message(f"Predicted syntax: {syntax.name} ({round(output.score * 100, 2)}% confidence)")
    return assign_syntax_to_view(view, syntax, details={"event": event, "reason": "Magika (Deep Learning)"})


def _assign_syntax_with_guesslang_async(view: sublime.View, event: ListenerEvent | None = None) -> None:
if not (
G.guesslang_client
Expand Down
1 change: 1 addition & 0 deletions plugin/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
PLUGIN_NAME = __package__.partition(".")[0] # like "AutoSetSyntax"

PLUGIN_STORAGE_DIR = Path(sublime.cache_path()).parent / f"Package Storage/{PLUGIN_NAME}"
PLUGIN_PY_LIBS_DIR = PLUGIN_STORAGE_DIR / f"libs-py38@{ST_PLATFORM_ARCH}"
PLUGIN_CUSTOM_DIR = Path(sublime.packages_path()) / f"{PLUGIN_NAME}-Custom"
PLUGIN_CUSTOM_MODULE_PATHS = {
"constraint": PLUGIN_CUSTOM_DIR / "constraints",
Expand Down
16 changes: 15 additions & 1 deletion plugin/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sublime

from .settings import get_st_setting
from .utils import is_plaintext_syntax, is_transient_view
from .utils import is_plaintext_syntax, is_transient_view, stable_unique


def is_syntaxable_view(view: sublime.View, must_plaintext: bool = False) -> bool:
Expand All @@ -15,3 +15,17 @@ def is_syntaxable_view(view: sublime.View, must_plaintext: bool = False) -> bool
and (not must_plaintext or ((syntax := view.syntax()) and is_plaintext_syntax(syntax)))
and ((size_max := get_st_setting("syntax_detection_size_limit", 0)) == 0 or size_max >= view.size())
)


def resolve_magika_label_with_syntax_map(label: str, syntax_map: dict[str, list[str]]) -> list[str]:
    """Resolve a Magika label into an ordered, de-duplicated list of syntax-likes.

    Entries of ``syntax_map[label]`` starting with ``"="`` are references to
    another label; they are replaced, in place, by that label's (recursively
    resolved) entries. Every other entry is kept as-is.

    Each label is expanded at most once, so circular references such as
    ``{"rs": ["=rust"], "rust": ["=rs"]}`` terminate instead of looping
    forever. Skipping a repeated expansion does not change the result: its
    entries are either already in the output or are being added by the
    expansion in progress, and duplicates are dropped anyway.

    Args:
        label: The content-type label to look up.
        syntax_map: Mapping of labels to syntax-likes and/or ``"=label"`` references.

    Returns:
        The resolved syntax-likes in their original order with duplicates
        removed; an empty list if ``label`` is not in ``syntax_map``.
    """
    expanded = {label}
    # Stack of entries still to process; reversed so that ``pop()`` yields
    # them in their original left-to-right order.
    pending = list(reversed(syntax_map.get(label, [])))
    resolved: list[str] = []

    while pending:
        syntax_like = pending.pop()
        if not syntax_like.startswith("="):
            resolved.append(syntax_like)
            continue

        target = syntax_like[1:]
        if target in expanded:
            continue  # already expanded (or a cycle): don't expand again
        expanded.add(target)
        pending.extend(reversed(syntax_map.get(target, [])))

    return list(stable_unique(resolved))
15 changes: 15 additions & 0 deletions tests/files/this-is-rust
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// This is a comment, and is ignored by the compiler.
// You can test this code by clicking the "Run" button over there ->
// or if you prefer to use your keyboard, you can use the "Ctrl + Enter"
// shortcut.

// This code is editable, feel free to hack it!
// You can always return to the original code by clicking the "Reset" button ->

// This is the main function.
fn main() {
// Statements here are executed when the compiled binary is called.

// Print text to the console.
println!("Hello World!");
}
5 changes: 5 additions & 0 deletions typings/magika/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from magika import magika as magika, prediction_mode as prediction_mode

Magika = magika.Magika
MagikaError = magika.MagikaError
PredictionMode = prediction_mode.PredictionMode
19 changes: 19 additions & 0 deletions typings/magika/cli/magika.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from _typeshed import Incomplete
from magika import Magika as Magika, MagikaError as MagikaError, PredictionMode as PredictionMode, colors as colors
from magika.content_types import ContentTypesManager as ContentTypesManager
from magika.logger import get_logger as get_logger
from magika.types import FeedbackReport as FeedbackReport, MagikaResult as MagikaResult
from pathlib import Path
from typing import List, Optional

VERSION: str
CONTACT_EMAIL: str
CONTEXT_SETTINGS: Incomplete
HELP_EPILOG: Incomplete

# Type stubs for the functions of magika's CLI module: signatures only, bodies are `...`.
# NOTE(review): the parameters of `main` presumably mirror the CLI options — confirm against the magika package.
def main(file: List[Path], recursive: bool, json_output: bool, jsonl_output: bool, mime_output: bool, label_output: bool, magic_compatibility_mode: bool, output_score: bool, prediction_mode_str: str, batch_size: int, no_dereference: bool, with_colors: bool, verbose: bool, debug: bool, generate_report_flag: bool, output_version: bool, list_output_content_types: bool, model_dir: Optional[Path]) -> None: ...
# Takes the list of input paths and returns a bool.
def should_read_from_stdin(files_paths: List[Path]) -> bool: ...
# Takes a Magika instance and returns a MagikaResult.
def get_magika_result_from_stdin(magika: Magika) -> MagikaResult: ...
# Builds a FeedbackReport from a Magika instance, a file path and that file's MagikaResult.
def generate_feedback_report(magika: Magika, file_path: Path, magika_result: MagikaResult) -> FeedbackReport: ...
# Takes a Magika instance and a list of FeedbackReport objects; returns nothing.
def print_feedback_report(magika: Magika, reports: List[FeedbackReport]) -> None: ...
# Takes no arguments and returns nothing.
def print_output_content_types_list() -> None: ...
17 changes: 17 additions & 0 deletions typings/magika/colors.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Type stub for magika's `colors` module: every name below is declared as a `str` constant.
# NOTE(review): presumably terminal color escape sequences — the values are not visible in this stub; confirm against the magika package.
BLACK: str
RED: str
GREEN: str
YELLOW: str
BLUE: str
PURPLE: str
CYAN: str
LIGHT_GRAY: str
DARK_GRAY: str
LIGHT_RED: str
LIGHT_GREEN: str
LIGHT_YELLOW: str
LIGHT_BLUE: str
LIGHT_PURPLE: str
LIGHT_CYAN: str
WHITE: str
RESET: str
72 changes: 72 additions & 0 deletions typings/magika/content_types.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from _typeshed import Incomplete
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

CONTENT_TYPES_CONFIG_PATH: Incomplete

# Type stub for `ContentType`: declares attributes and signatures only, bodies are `...`.
class ContentType:
# Upper-case names are declared as `str` attributes.
# NOTE(review): presumably labels for special/unknown content types — values are not visible here; confirm.
UNKNOWN: str
UNKNOWN_MIME_TYPE: str
UNKNOWN_CONTENT_TYPE_GROUP: str
UNKNOWN_MAGIC: str
UNKNOWN_DESCRIPTION: str
UNSUPPORTED: str
ERROR: str
MISSING: str
EMPTY: str
CORRUPTED: str
TIMEOUT: str
NOT_VALID: str
FILE_DOES_NOT_EXIST: str
PERMISSION_ERROR: str
DIRECTORY: str
SYMLINK: str
GENERIC_TEXT: str
# Attributes typed `Incomplete`: the stub does not record their precise types.
# Their names match the `__init__` parameters below.
name: Incomplete
extensions: Incomplete
mime_type: Incomplete
group: Incomplete
magic: Incomplete
description: Incomplete
vt_type: Incomplete
datasets: Incomplete
parent: Incomplete
tags: Incomplete
model_target_label: Incomplete
target_label: Incomplete
correct_labels: Incomplete
in_scope_for_output_content_type: Incomplete
# Only `add_automatic_tags` has a default (True); every other argument is required.
def __init__(self, name: str, extensions: List[str], mime_type: Optional[str], group: Optional[str], magic: Optional[str], description: Optional[str], vt_type: Optional[str], datasets: List[str], parent: Optional[str], tags: List[str], model_target_label: Optional[str], target_label: Optional[str], correct_labels: List[str], in_scope_for_output_content_type: bool, add_automatic_tags: bool = True) -> None: ...
@property
def is_text(self) -> bool: ...
@property
def in_scope_for_training(self) -> bool: ...
# Dictionary conversion pair: `to_dict` returns a dict, `from_dict` builds a ContentType from one.
def to_dict(self) -> Dict[str, Any]: ...
@staticmethod
def from_dict(info_d: Dict, add_automatic_tags: bool = True) -> ContentType: ...

# Type stub for `ContentTypesManager`: declares attributes and signatures only, bodies are `...`.
class ContentTypesManager:
SPECIAL_CONTENT_TYPES: List[str]
SUPPORTED_TARGET_LABELS_SPEC: Incomplete
# Attributes typed `Incomplete`: the stub does not record their precise types.
# NOTE(review): names suggest lookups of ContentType objects (by tag / by extension) — confirm.
cts: Incomplete
tag2cts: Incomplete
ext2cts: Incomplete
# Both constructor arguments have defaults.
def __init__(self, content_type_config_path: Path = ..., add_automatic_tags: bool = True) -> None: ...
def load_content_types_info(self, content_type_config_path: Path, add_automatic_tags: bool = True) -> None: ...
# `get` may return None; `get_or_raise` always returns a ContentType.
def get(self, content_type_name: str) -> Optional[ContentType]: ...
def get_or_raise(self, content_type_name: Optional[str]) -> ContentType: ...
# The getters below return a `str` and accept a `default` argument.
def get_mime_type(self, content_type_name: str, default: str = ...) -> str: ...
def get_group(self, content_type_name: str, default: str = ...) -> str: ...
def get_magic(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ...
def get_description(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ...
def get_cts_by_ext(self, ext: str) -> List[ContentType]: ...
def get_cts_by_ext_or_raise(self, ext: str) -> List[ContentType]: ...
def get_valid_tags(self, only_explicit: bool = True) -> List[str]: ...
def is_valid_ct_label(self, label: str) -> bool: ...
def is_valid_tag(self, tag: str) -> bool: ...
# `select` returns ContentType objects, `select_names` returns strings; both take the same arguments.
def select(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[ContentType]: ...
def select_names(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[str]: ...
def get_content_types_space(self) -> List[str]: ...
def get_output_content_types(self) -> List[ContentType]: ...
def get_output_content_types_names(self) -> List[str]: ...
def get_invalid_labels(self, labels: Iterable[str]) -> List[str]: ...
17 changes: 17 additions & 0 deletions typings/magika/logger.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from _typeshed import Incomplete
from magika import colors as colors
from typing import TextIO

# Type stub for `SimpleLogger`: declares attributes and signatures only, bodies are `...`.
class SimpleLogger:
# Attributes typed `Incomplete`: the stub does not record their precise types.
level: Incomplete
use_colors: Incomplete
# Colors are off by default.
def __init__(self, use_colors: bool = False) -> None: ...
def setLevel(self, level: int) -> None: ...
def raw_print_to_stdout(self, msg: str) -> None: ...
# `file` has a default that is not visible in this stub.
def raw_print(self, msg: str, file: TextIO = ...) -> None: ...
# One method per message level; each takes the message text and returns nothing.
def debug(self, msg: str) -> None: ...
def info(self, msg: str) -> None: ...
def warning(self, msg: str) -> None: ...
def error(self, msg: str) -> None: ...

# Stub: returns a SimpleLogger; `use_colors` defaults to False.
def get_logger(use_colors: bool = False) -> SimpleLogger: ...
17 changes: 17 additions & 0 deletions typings/magika/magika.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from magika.content_types import ContentType as ContentType, ContentTypesManager as ContentTypesManager
from magika.logger import get_logger as get_logger
from magika.prediction_mode import PredictionMode as PredictionMode
from magika.types import MagikaOutputFields as MagikaOutputFields, MagikaResult as MagikaResult, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, ModelOutputFields as ModelOutputFields
from pathlib import Path
from typing import List, Optional

# Type stub for the `Magika` class: declares signatures only, bodies are `...`.
class Magika:
# Every constructor argument has a default, so `Magika()` with no arguments type-checks.
def __init__(self, model_dir: Optional[Path] = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ...
# The identify_* methods return a MagikaResult (a list of them for `identify_paths`).
def identify_path(self, path: Path) -> MagikaResult: ...
def identify_paths(self, paths: List[Path]) -> List[MagikaResult]: ...
def identify_bytes(self, content: bytes) -> MagikaResult: ...
@staticmethod
def get_default_model_name() -> str: ...
def get_model_name(self) -> str: ...

# Stub: exception type declared by the magika module; subclasses `Exception`.
class MagikaError(Exception): ...
10 changes: 10 additions & 0 deletions typings/magika/prediction_mode.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from _typeshed import Incomplete
from magika.strenum import LowerCaseStrEnum as LowerCaseStrEnum
from typing import List

# Type stub for the `PredictionMode` enum (a `LowerCaseStrEnum` subclass).
class PredictionMode(LowerCaseStrEnum):
# Members are typed `Incomplete`: their values are not recorded in this stub.
BEST_GUESS: Incomplete
MEDIUM_CONFIDENCE: Incomplete
HIGH_CONFIDENCE: Incomplete
# Returns the valid prediction modes as a list of strings.
@staticmethod
def get_valid_prediction_modes() -> List[str]: ...
6 changes: 6 additions & 0 deletions typings/magika/strenum.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import enum

# Type stub for `StrEnum`: an enum whose members are also `str` instances (it subclasses both `str` and `enum.Enum`).
class StrEnum(str, enum.Enum):
# Accepts either a plain string or another StrEnum member as the value.
def __new__(cls, value: str | StrEnum, *args, **kwargs): ...

# Stub: `StrEnum` subclass with no members or methods declared here.
# NOTE(review): the name suggests lower-cased member values — not visible in this stub; confirm.
class LowerCaseStrEnum(StrEnum): ...
Loading

0 comments on commit 9cff43d

Please sign in to comment.