diff --git a/AutoSetSyntax.sublime-settings b/AutoSetSyntax.sublime-settings index 121a5ddc..f185858b 100644 --- a/AutoSetSyntax.sublime-settings +++ b/AutoSetSyntax.sublime-settings @@ -643,11 +643,20 @@ } ], - /////////////////////////////////////// - // Guesslang settings (experimental) // - ///////////////////////////////////////////////////////////////////////// - // You have to restart ST after modifying any of guesslang's settings. // - ///////////////////////////////////////////////////////////////////////// + ///////////////////// + // Magika settings // + ///////////////////// + + // To use this feature, you have to install the "magika" library. + // @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/dl-based-syntax-detection/#prerequisites + "magika.enabled": true, + "magika.min_confidence": 0.85, + // @see https://github.com/google/magika/blob/9e733e847ea0d93ea100d5d478a4b54c3ec5fd1c/docs/supported-content-types-list.md + "magika.syntax_map": { + "rs": ["scope:source.rust"], + // ... + "rust": ["=rs"], + }, // To use this feature, you have to install the server. 
def _add_python_lib_path() -> None:
    """Make the plugin's private Python library directory importable.

    The directory (``PLUGIN_PY_LIBS_DIR``) is placed at the front of ``sys.path``
    unless it is already listed there, so calling this repeatedly is harmless.
    """
    libs_dir = str(PLUGIN_PY_LIBS_DIR)
    if libs_dir in sys.path:
        return
    sys.path.insert(0, libs_dir)
def resolve_magika_label_with_syntax_map(label: str, syntax_map: dict[str, list[str]]) -> list[str]:
    """Expand a Magika content-type label into an ordered list of syntax-likes.

    The entries of ``syntax_map[label]`` are taken in order. An entry that starts
    with ``=`` is an alias: it is replaced in place by the (recursively expanded)
    entries of the label it names. Duplicates are dropped, keeping the first
    occurrence.

    Every label is expanded at most once, so circular aliases such as
    ``{"rs": ["=rust"], "rust": ["=rs"]}`` terminate instead of looping forever.
    ``syntax_map`` is never mutated.

    :param label: The label to resolve (e.g. ``"rust"``).
    :param syntax_map: Mapping from a label to syntax-likes and/or ``=alias`` entries.
    :returns: The de-duplicated syntax-likes; empty if the label is unknown or
        resolves to nothing.
    """
    # a dict is used as an insertion-ordered set: first occurrence wins
    resolved: dict[str, None] = {}
    expanded: set[str] = {label}
    # stack holding the not-yet-processed entries; the next entry is at the end
    pending: list[str] = list(reversed(syntax_map.get(label, [])))

    while pending:
        syntax_like = pending.pop()
        if syntax_like.startswith("="):
            target = syntax_like[1:]
            # a label seen before has nothing new to contribute; skipping it also breaks cycles
            if target not in expanded:
                expanded.add(target)
                pending.extend(reversed(syntax_map.get(target, [])))
            continue
        resolved.setdefault(syntax_like)

    return list(resolved)


# Lazily created classifier shared by all calls of `_assign_syntax_with_magika`.
# NOTE(review): constructing `Magika` presumably loads its model, which is too
# costly to repeat on every listener event -- confirm against Magika's docs.
_magika_classifier = None


def _assign_syntax_with_magika(view: sublime.View, event: ListenerEvent | None = None) -> bool:
    """Try to assign a syntax to ``view`` from Magika's content-type prediction.

    Nothing is done (and ``False`` is returned) unless the "magika.enabled" setting
    is truthy, the view has a snapshot whose syntax is plain text, and the
    pre-conditions commented below hold. The prediction is ignored when its score is
    below the "magika.min_confidence" setting or its label is "empty", "txt" or
    "unknown". The label is then translated via the "magika.syntax_map" setting.

    :param view: The view to assign a syntax to.
    :param event: The listener event which triggered this detection, if any.
    :returns: ``False`` if detection was skipped or failed; otherwise the result of
        ``assign_syntax_to_view``.
    """
    global _magika_classifier

    if not (
        (window := view.window())
        and (settings := get_merged_plugin_settings(window=window))
        and settings.get("magika.enabled")
        and (view_snapshot := G.view_snapshot_collection.get_by_view(view))
        # don't apply to files which have an extension (unless explicitly requested by command)
        and (event == ListenerEvent.COMMAND or "." not in view_snapshot.file_name_unhidden)
        # only apply on plain text syntax
        and ((syntax := view_snapshot.syntax) and is_plaintext_syntax(syntax))
        # we don't want to use AI model during typing when there is only one line
        # that may result in unwanted behavior such as a new buffer may be assigned to Python
        # right after "import" is typed but it could be JavaScript or TypeScript as well
        and (event != ListenerEvent.MODIFY or "\n" in view_snapshot.content)
    ):
        return False

    if _magika_classifier is None:
        # imported lazily because "magika" is an optional, user-installed library;
        # a failed import is not cached, so installing it later works without a restart
        try:
            from magika import Magika
        except ImportError as e:
            Logger.log(f"💣 Error occured when importing Magika: {e}", window=window)
            return False

        _magika_classifier = Magika()

    output = _magika_classifier.identify_bytes(view_snapshot.content.encode()).output
    Logger.log(f"🐛 Magika's prediction: {output}", window=window)

    threshold: float = settings.get("magika.min_confidence", 0.0)
    if output.score < threshold or output.ct_label in {"empty", "txt", "unknown"}:
        return False

    syntax_map: dict[str, list[str]] = settings.get("magika.syntax_map", {})
    if not (syntax_likes := resolve_magika_label_with_syntax_map(output.ct_label, syntax_map)):
        Logger.log(f'🤔 Unknown "label" from Magika: {output.ct_label}', window=window)
        return False

    if not (syntax := find_syntax_by_syntax_likes(syntax_likes, include_plaintext=False)):
        Logger.log(f"😢 Failed finding syntax from Magika: {syntax_likes}", window=window)
        return False

    sublime.status_message(f"Predicted syntax: {syntax.name} ({round(output.score * 100, 2)}% confidence)")
    return assign_syntax_to_view(view, syntax, details={"event": event, "reason": "Magika (Deep Learning)"})
+CONTEXT_SETTINGS: Incomplete +HELP_EPILOG: Incomplete + +def main(file: List[Path], recursive: bool, json_output: bool, jsonl_output: bool, mime_output: bool, label_output: bool, magic_compatibility_mode: bool, output_score: bool, prediction_mode_str: str, batch_size: int, no_dereference: bool, with_colors: bool, verbose: bool, debug: bool, generate_report_flag: bool, output_version: bool, list_output_content_types: bool, model_dir: Optional[Path]) -> None: ... +def should_read_from_stdin(files_paths: List[Path]) -> bool: ... +def get_magika_result_from_stdin(magika: Magika) -> MagikaResult: ... +def generate_feedback_report(magika: Magika, file_path: Path, magika_result: MagikaResult) -> FeedbackReport: ... +def print_feedback_report(magika: Magika, reports: List[FeedbackReport]) -> None: ... +def print_output_content_types_list() -> None: ... diff --git a/typings/magika/colors.pyi b/typings/magika/colors.pyi new file mode 100644 index 00000000..1f9dc70e --- /dev/null +++ b/typings/magika/colors.pyi @@ -0,0 +1,17 @@ +BLACK: str +RED: str +GREEN: str +YELLOW: str +BLUE: str +PURPLE: str +CYAN: str +LIGHT_GRAY: str +DARK_GRAY: str +LIGHT_RED: str +LIGHT_GREEN: str +LIGHT_YELLOW: str +LIGHT_BLUE: str +LIGHT_PURPLE: str +LIGHT_CYAN: str +WHITE: str +RESET: str diff --git a/typings/magika/content_types.pyi b/typings/magika/content_types.pyi new file mode 100644 index 00000000..5f376e95 --- /dev/null +++ b/typings/magika/content_types.pyi @@ -0,0 +1,72 @@ +from _typeshed import Incomplete +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + +CONTENT_TYPES_CONFIG_PATH: Incomplete + +class ContentType: + UNKNOWN: str + UNKNOWN_MIME_TYPE: str + UNKNOWN_CONTENT_TYPE_GROUP: str + UNKNOWN_MAGIC: str + UNKNOWN_DESCRIPTION: str + UNSUPPORTED: str + ERROR: str + MISSING: str + EMPTY: str + CORRUPTED: str + TIMEOUT: str + NOT_VALID: str + FILE_DOES_NOT_EXIST: str + PERMISSION_ERROR: str + DIRECTORY: str + SYMLINK: str + GENERIC_TEXT: str + name: 
Incomplete + extensions: Incomplete + mime_type: Incomplete + group: Incomplete + magic: Incomplete + description: Incomplete + vt_type: Incomplete + datasets: Incomplete + parent: Incomplete + tags: Incomplete + model_target_label: Incomplete + target_label: Incomplete + correct_labels: Incomplete + in_scope_for_output_content_type: Incomplete + def __init__(self, name: str, extensions: List[str], mime_type: Optional[str], group: Optional[str], magic: Optional[str], description: Optional[str], vt_type: Optional[str], datasets: List[str], parent: Optional[str], tags: List[str], model_target_label: Optional[str], target_label: Optional[str], correct_labels: List[str], in_scope_for_output_content_type: bool, add_automatic_tags: bool = True) -> None: ... + @property + def is_text(self) -> bool: ... + @property + def in_scope_for_training(self) -> bool: ... + def to_dict(self) -> Dict[str, Any]: ... + @staticmethod + def from_dict(info_d: Dict, add_automatic_tags: bool = True) -> ContentType: ... + +class ContentTypesManager: + SPECIAL_CONTENT_TYPES: List[str] + SUPPORTED_TARGET_LABELS_SPEC: Incomplete + cts: Incomplete + tag2cts: Incomplete + ext2cts: Incomplete + def __init__(self, content_type_config_path: Path = ..., add_automatic_tags: bool = True) -> None: ... + def load_content_types_info(self, content_type_config_path: Path, add_automatic_tags: bool = True) -> None: ... + def get(self, content_type_name: str) -> Optional[ContentType]: ... + def get_or_raise(self, content_type_name: Optional[str]) -> ContentType: ... + def get_mime_type(self, content_type_name: str, default: str = ...) -> str: ... + def get_group(self, content_type_name: str, default: str = ...) -> str: ... + def get_magic(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ... + def get_description(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ... + def get_cts_by_ext(self, ext: str) -> List[ContentType]: ... 
+ def get_cts_by_ext_or_raise(self, ext: str) -> List[ContentType]: ... + def get_valid_tags(self, only_explicit: bool = True) -> List[str]: ... + def is_valid_ct_label(self, label: str) -> bool: ... + def is_valid_tag(self, tag: str) -> bool: ... + def select(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[ContentType]: ... + def select_names(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[str]: ... + def get_content_types_space(self) -> List[str]: ... + def get_output_content_types(self) -> List[ContentType]: ... + def get_output_content_types_names(self) -> List[str]: ... + def get_invalid_labels(self, labels: Iterable[str]) -> List[str]: ... diff --git a/typings/magika/logger.pyi b/typings/magika/logger.pyi new file mode 100644 index 00000000..f1f3ce17 --- /dev/null +++ b/typings/magika/logger.pyi @@ -0,0 +1,17 @@ +from _typeshed import Incomplete +from magika import colors as colors +from typing import TextIO + +class SimpleLogger: + level: Incomplete + use_colors: Incomplete + def __init__(self, use_colors: bool = False) -> None: ... + def setLevel(self, level: int) -> None: ... + def raw_print_to_stdout(self, msg: str) -> None: ... + def raw_print(self, msg: str, file: TextIO = ...) -> None: ... + def debug(self, msg: str) -> None: ... + def info(self, msg: str) -> None: ... + def warning(self, msg: str) -> None: ... + def error(self, msg: str) -> None: ... + +def get_logger(use_colors: bool = False) -> SimpleLogger: ... 
diff --git a/typings/magika/magika.pyi b/typings/magika/magika.pyi new file mode 100644 index 00000000..c9873c78 --- /dev/null +++ b/typings/magika/magika.pyi @@ -0,0 +1,17 @@ +from magika.content_types import ContentType as ContentType, ContentTypesManager as ContentTypesManager +from magika.logger import get_logger as get_logger +from magika.prediction_mode import PredictionMode as PredictionMode +from magika.types import MagikaOutputFields as MagikaOutputFields, MagikaResult as MagikaResult, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, ModelOutputFields as ModelOutputFields +from pathlib import Path +from typing import List, Optional + +class Magika: + def __init__(self, model_dir: Optional[Path] = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ... + def identify_path(self, path: Path) -> MagikaResult: ... + def identify_paths(self, paths: List[Path]) -> List[MagikaResult]: ... + def identify_bytes(self, content: bytes) -> MagikaResult: ... + @staticmethod + def get_default_model_name() -> str: ... + def get_model_name(self) -> str: ... + +class MagikaError(Exception): ... diff --git a/typings/magika/prediction_mode.pyi b/typings/magika/prediction_mode.pyi new file mode 100644 index 00000000..f3ec6210 --- /dev/null +++ b/typings/magika/prediction_mode.pyi @@ -0,0 +1,10 @@ +from _typeshed import Incomplete +from magika.strenum import LowerCaseStrEnum as LowerCaseStrEnum +from typing import List + +class PredictionMode(LowerCaseStrEnum): + BEST_GUESS: Incomplete + MEDIUM_CONFIDENCE: Incomplete + HIGH_CONFIDENCE: Incomplete + @staticmethod + def get_valid_prediction_modes() -> List[str]: ... 
diff --git a/typings/magika/strenum.pyi b/typings/magika/strenum.pyi new file mode 100644 index 00000000..05cab4e5 --- /dev/null +++ b/typings/magika/strenum.pyi @@ -0,0 +1,6 @@ +import enum + +class StrEnum(str, enum.Enum): + def __new__(cls, value: str | StrEnum, *args, **kwargs): ... + +class LowerCaseStrEnum(StrEnum): ... diff --git a/typings/magika/types.pyi b/typings/magika/types.pyi new file mode 100644 index 00000000..c55fbe89 --- /dev/null +++ b/typings/magika/types.pyi @@ -0,0 +1,49 @@ +from dataclasses import dataclass +from typing import List, Optional + +@dataclass +class ModelFeatures: + beg: List[int] + mid: List[int] + end: List[int] + def __init__(self, beg, mid, end) -> None: ... + +@dataclass +class ModelOutput: + ct_label: str + score: float + def __init__(self, ct_label, score) -> None: ... + +@dataclass +class MagikaResult: + path: str + dl: ModelOutputFields + output: MagikaOutputFields + def __init__(self, path, dl, output) -> None: ... + +@dataclass +class ModelOutputFields: + ct_label: Optional[str] + score: Optional[float] + group: Optional[str] + mime_type: Optional[str] + magic: Optional[str] + description: Optional[str] + def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ... + +@dataclass +class MagikaOutputFields: + ct_label: str + score: float + group: str + mime_type: str + magic: str + description: str + def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ... + +@dataclass +class FeedbackReport: + hash: str + features: ModelFeatures + result: MagikaResult + def __init__(self, hash, features, result) -> None: ...