Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Cherng <[email protected]>
  • Loading branch information
jfcherng committed Feb 17, 2024
1 parent e502ed0 commit 6d89e57
Show file tree
Hide file tree
Showing 12 changed files with 254 additions and 1 deletion.
9 changes: 8 additions & 1 deletion plugin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib
import importlib.machinery
import pkgutil
import sys
from pathlib import Path

import sublime
Expand All @@ -15,7 +16,7 @@
AutoSetSyntaxRestartGuesslangCommand,
run_auto_set_syntax_on_view,
)
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME, PLUGIN_PY_LIBS_DIR
from .listener import (
AutoSetSyntaxEventListener,
AutoSetSyntaxTextChangeListener,
Expand Down Expand Up @@ -67,6 +68,7 @@ def plugin_loaded() -> None:


def _plugin_loaded() -> None:
_add_python_lib_path()
_load_custom_implementations()

AioSettings.plugin_name = PLUGIN_NAME
Expand Down Expand Up @@ -98,6 +100,11 @@ def _settings_changed_callback(window: sublime.Window) -> None:
compile_rules(window, is_update=True)


def _add_python_lib_path() -> None:
    """Put the plugin's Python libraries directory at the front of ``sys.path`` if it is not listed yet."""
    libs_dir = str(PLUGIN_PY_LIBS_DIR)
    if libs_dir in sys.path:
        return
    # prepend so that modules from this directory are found before any other entry
    sys.path.insert(0, libs_dir)


def _load_custom_implementations() -> None:
for finder, name, _ in pkgutil.iter_modules(map(str, PLUGIN_CUSTOM_MODULE_PATHS.values())):
assert isinstance(finder, importlib.machinery.FileFinder)
Expand Down
33 changes: 33 additions & 0 deletions plugin/commands/auto_set_syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,39 @@ def is_json(view: sublime.View) -> bool:
return False


def _assign_syntax_with_magika(view: sublime.View, event: ListenerEvent | None = None) -> bool:
    """
    Guess the syntax of a plain-text view with Magika and assign it to the view.

    Magika is an optional dependency: if it cannot be imported, nothing is done.

    :param view: The view whose syntax should be detected.
    :param event: The listener event which triggered the detection, if any.

    :returns: The result of ``assign_syntax_to_view`` when the prediction could be resolved
              into a non-plain-text syntax; ``False`` when the detection is skipped, the
              prediction is not confident enough or no matching syntax is found.
    """
    if not (
        (view_snapshot := G.view_snapshot_collection.get_by_view(view))
        # don't apply on files which have an extension (unless explicitly requested by a command)
        and (event == ListenerEvent.COMMAND or "." not in view_snapshot.file_name_unhidden)
        # only apply on plain text syntax
        and ((syntax := view_snapshot.syntax) and is_plaintext_syntax(syntax))
        # we don't want to use AI model during typing when there is only one line
        # that may result in unwanted behavior such as a new buffer may be assigned to Python
        # right after "import" is typed but it could be JavaScript or TypeScript as well
        and (event != ListenerEvent.MODIFY or "\n" in view_snapshot.content)
    ):
        return False

    try:
        from magika import Magika
        from magika.content_types import ContentTypesManager
    except ImportError:
        return False

    threshold = 0.5  # @todo: configurable

    # NOTE(review): the model is constructed on every call; consider caching it if that is costly
    model = Magika()
    output = model.identify_bytes(view_snapshot.content.encode()).output
    Logger.log(f"🐛 Magika's prediction: {output}")

    label = output.ct_label
    if output.score < threshold or label in {"empty", "txt", "unknown"}:
        return False

    # Resolve the predicted label into a syntax via the file extensions Magika associates with it.
    # Previously the view's current (plain text) syntax was passed on, so the prediction was never applied.
    # NOTE(review): assumes extensions are bare strings like "py"; a leading dot is stripped just in case.
    # @see https://github.com/google/magika/blob/9e733e847ea0d93ea100d5d478a4b54c3ec5fd1c/docs/supported-content-types-list.md
    content_type = ContentTypesManager().get(label)
    extensions = (content_type.extensions if content_type else None) or [label]
    for extension in extensions:
        new_syntax = sublime.find_syntax_for_file(f"untitled.{extension.lstrip('.')}")
        # a plain text result means the extension is unknown to Sublime Text
        if new_syntax and not is_plaintext_syntax(new_syntax):
            return assign_syntax_to_view(
                view,
                new_syntax,
                details={"event": event, "reason": "Magika (Deep Learning)"},
            )

    return False


def _assign_syntax_with_guesslang_async(view: sublime.View, event: ListenerEvent | None = None) -> None:
if not (
G.guesslang_client
Expand Down
1 change: 1 addition & 0 deletions plugin/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
# The plugin name is the first dotted component of this module's package.
PLUGIN_NAME = __package__.partition(".")[0] # like "AutoSetSyntax"

# Per-plugin storage directory: "Package Storage/<plugin name>" next to Sublime Text's cache directory.
PLUGIN_STORAGE_DIR = Path(sublime.cache_path()).parent / f"Package Storage/{PLUGIN_NAME}"
# Directory under the storage directory for Python libraries; its name carries "py38" and the platform/arch.
PLUGIN_PY_LIBS_DIR = PLUGIN_STORAGE_DIR / f"libs-py38@{ST_PLATFORM_ARCH}"
# User customization directory: "<plugin name>-Custom" inside Sublime Text's Packages directory.
PLUGIN_CUSTOM_DIR = Path(sublime.packages_path()) / f"{PLUGIN_NAME}-Custom"
PLUGIN_CUSTOM_MODULE_PATHS = {
"constraint": PLUGIN_CUSTOM_DIR / "constraints",
Expand Down
5 changes: 5 additions & 0 deletions typings/magika/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Type stub for the ``magika`` package: re-exports the submodules used below.
from magika import magika as magika, prediction_mode as prediction_mode

# Package-level aliases of the definitions living in the submodules.
Magika = magika.Magika
MagikaError = magika.MagikaError
PredictionMode = prediction_mode.PredictionMode
19 changes: 19 additions & 0 deletions typings/magika/cli/magika.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from _typeshed import Incomplete
from magika import Magika as Magika, MagikaError as MagikaError, PredictionMode as PredictionMode, colors as colors
from magika.content_types import ContentTypesManager as ContentTypesManager
from magika.logger import get_logger as get_logger
from magika.types import FeedbackReport as FeedbackReport, MagikaResult as MagikaResult
from pathlib import Path
from typing import List, Optional

# Module-level values of the CLI module; only their declared types are visible in this stub.
VERSION: str
CONTACT_EMAIL: str
# "Incomplete" means the stub does not specify a precise type for these two values.
CONTEXT_SETTINGS: Incomplete
HELP_EPILOG: Incomplete

# Signature of the CLI entry point; returns nothing.
# NOTE(review): each parameter presumably mirrors one command-line option/flag — not visible in this stub.
def main(file: List[Path], recursive: bool, json_output: bool, jsonl_output: bool, mime_output: bool, label_output: bool, magic_compatibility_mode: bool, output_score: bool, prediction_mode_str: str, batch_size: int, no_dereference: bool, with_colors: bool, verbose: bool, debug: bool, generate_report_flag: bool, output_version: bool, list_output_content_types: bool, model_dir: Optional[Path]) -> None: ...
# Takes the list of input paths and returns a bool.
# NOTE(review): presumably tells whether the input should come from stdin (inferred from the name).
def should_read_from_stdin(files_paths: List[Path]) -> bool: ...
# Takes a Magika instance and returns a MagikaResult.
# NOTE(review): presumably the identified content is read from stdin (inferred from the name).
def get_magika_result_from_stdin(magika: Magika) -> MagikaResult: ...
# Returns a FeedbackReport for one file path and the MagikaResult obtained for it.
def generate_feedback_report(magika: Magika, file_path: Path, magika_result: MagikaResult) -> FeedbackReport: ...
# Takes a Magika instance and a list of FeedbackReport objects; returns nothing.
def print_feedback_report(magika: Magika, reports: List[FeedbackReport]) -> None: ...
# Takes no arguments and returns nothing.
# NOTE(review): presumably prints the list of output content types (inferred from the name).
def print_output_content_types_list() -> None: ...
17 changes: 17 additions & 0 deletions typings/magika/colors.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# String constants named after colours, plus RESET.
# NOTE(review): presumably terminal colour escape sequences — the values are not visible in this stub.
BLACK: str
RED: str
GREEN: str
YELLOW: str
BLUE: str
PURPLE: str
CYAN: str
LIGHT_GRAY: str
DARK_GRAY: str
LIGHT_RED: str
LIGHT_GREEN: str
LIGHT_YELLOW: str
LIGHT_BLUE: str
LIGHT_PURPLE: str
LIGHT_CYAN: str
WHITE: str
RESET: str
72 changes: 72 additions & 0 deletions typings/magika/content_types.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from _typeshed import Incomplete
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

# NOTE(review): presumably the default path of the content types config file; its type is left unspecified here.
CONTENT_TYPES_CONFIG_PATH: Incomplete

# Stub of a class describing a single content type.
class ContentType:
# Upper-case, string-typed class-level names.
# NOTE(review): presumably labels for special/unknown content types — values are not visible in this stub.
UNKNOWN: str
UNKNOWN_MIME_TYPE: str
UNKNOWN_CONTENT_TYPE_GROUP: str
UNKNOWN_MAGIC: str
UNKNOWN_DESCRIPTION: str
UNSUPPORTED: str
ERROR: str
MISSING: str
EMPTY: str
CORRUPTED: str
TIMEOUT: str
NOT_VALID: str
FILE_DOES_NOT_EXIST: str
PERMISSION_ERROR: str
DIRECTORY: str
SYMLINK: str
GENERIC_TEXT: str
# Attributes with unspecified ("Incomplete") types; each shares its name with an ``__init__`` parameter,
# whose annotation there is the best available hint about its type.
name: Incomplete
extensions: Incomplete
mime_type: Incomplete
group: Incomplete
magic: Incomplete
description: Incomplete
vt_type: Incomplete
datasets: Incomplete
parent: Incomplete
tags: Incomplete
model_target_label: Incomplete
target_label: Incomplete
correct_labels: Incomplete
in_scope_for_output_content_type: Incomplete
# ``add_automatic_tags`` is the only parameter with a default (True).
def __init__(self, name: str, extensions: List[str], mime_type: Optional[str], group: Optional[str], magic: Optional[str], description: Optional[str], vt_type: Optional[str], datasets: List[str], parent: Optional[str], tags: List[str], model_target_label: Optional[str], target_label: Optional[str], correct_labels: List[str], in_scope_for_output_content_type: bool, add_automatic_tags: bool = True) -> None: ...
# Read-only boolean properties.
@property
def is_text(self) -> bool: ...
@property
def in_scope_for_training(self) -> bool: ...
# Conversion to a dict and construction from a dict.
def to_dict(self) -> Dict[str, Any]: ...
@staticmethod
def from_dict(info_d: Dict, add_automatic_tags: bool = True) -> ContentType: ...

# Stub of a class for looking up and querying ContentType objects.
class ContentTypesManager:
SPECIAL_CONTENT_TYPES: List[str]
SUPPORTED_TARGET_LABELS_SPEC: Incomplete
# Attributes with unspecified types.
# NOTE(review): presumably content types indexed by name, by tag and by extension (inferred from the names).
cts: Incomplete
tag2cts: Incomplete
ext2cts: Incomplete
# The config path has a default value (elided as ``...`` in this stub).
def __init__(self, content_type_config_path: Path = ..., add_automatic_tags: bool = True) -> None: ...
def load_content_types_info(self, content_type_config_path: Path, add_automatic_tags: bool = True) -> None: ...
# ``get`` may return None; ``get_or_raise`` always returns a ContentType.
def get(self, content_type_name: str) -> Optional[ContentType]: ...
def get_or_raise(self, content_type_name: Optional[str]) -> ContentType: ...
# String getters for one content type name; each accepts a ``default`` string.
def get_mime_type(self, content_type_name: str, default: str = ...) -> str: ...
def get_group(self, content_type_name: str, default: str = ...) -> str: ...
def get_magic(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ...
def get_description(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ...
# Lookups by file extension; both return a list of ContentType.
def get_cts_by_ext(self, ext: str) -> List[ContentType]: ...
def get_cts_by_ext_or_raise(self, ext: str) -> List[ContentType]: ...
def get_valid_tags(self, only_explicit: bool = True) -> List[str]: ...
def is_valid_ct_label(self, label: str) -> bool: ...
def is_valid_tag(self, tag: str) -> bool: ...
# ``select`` returns ContentType objects while ``select_names`` returns strings for the same parameters.
def select(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[ContentType]: ...
def select_names(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[str]: ...
def get_content_types_space(self) -> List[str]: ...
def get_output_content_types(self) -> List[ContentType]: ...
def get_output_content_types_names(self) -> List[str]: ...
# Takes an iterable of labels and returns a list of strings.
# NOTE(review): presumably the subset of labels that are not valid (inferred from the name).
def get_invalid_labels(self, labels: Iterable[str]) -> List[str]: ...
17 changes: 17 additions & 0 deletions typings/magika/logger.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from _typeshed import Incomplete
from magika import colors as colors
from typing import TextIO

# Stub of a small logger class with level-named message methods.
class SimpleLogger:
level: Incomplete
use_colors: Incomplete
# Colours are off by default.
def __init__(self, use_colors: bool = False) -> None: ...
def setLevel(self, level: int) -> None: ...
# Raw printing helpers; ``raw_print`` accepts a target text stream with a default (elided here).
def raw_print_to_stdout(self, msg: str) -> None: ...
def raw_print(self, msg: str, file: TextIO = ...) -> None: ...
# One method per level; each takes the message string and returns nothing.
def debug(self, msg: str) -> None: ...
def info(self, msg: str) -> None: ...
def warning(self, msg: str) -> None: ...
def error(self, msg: str) -> None: ...

# Returns a SimpleLogger; ``use_colors`` defaults to False.
def get_logger(use_colors: bool = False) -> SimpleLogger: ...
17 changes: 17 additions & 0 deletions typings/magika/magika.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from magika.content_types import ContentType as ContentType, ContentTypesManager as ContentTypesManager
from magika.logger import get_logger as get_logger
from magika.prediction_mode import PredictionMode as PredictionMode
from magika.types import MagikaOutputFields as MagikaOutputFields, MagikaResult as MagikaResult, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, ModelOutputFields as ModelOutputFields
from pathlib import Path
from typing import List, Optional

# Stub of the main Magika class: identifies content given as paths or raw bytes.
class Magika:
# Every constructor argument is optional.
def __init__(self, model_dir: Optional[Path] = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ...
# One MagikaResult per path; ``identify_paths`` returns a list of results.
def identify_path(self, path: Path) -> MagikaResult: ...
def identify_paths(self, paths: List[Path]) -> List[MagikaResult]: ...
# Identification of in-memory content given as bytes.
def identify_bytes(self, content: bytes) -> MagikaResult: ...
@staticmethod
def get_default_model_name() -> str: ...
def get_model_name(self) -> str: ...

# Exception type declared by the magika module; subclasses the built-in Exception.
class MagikaError(Exception): ...
10 changes: 10 additions & 0 deletions typings/magika/prediction_mode.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from _typeshed import Incomplete
from magika.strenum import LowerCaseStrEnum as LowerCaseStrEnum
from typing import List

# Enumeration (based on LowerCaseStrEnum) with three members.
class PredictionMode(LowerCaseStrEnum):
# Member value types are left unspecified ("Incomplete") in this stub.
BEST_GUESS: Incomplete
MEDIUM_CONFIDENCE: Incomplete
HIGH_CONFIDENCE: Incomplete
# Returns a list of strings.
# NOTE(review): presumably the valid mode names (inferred from the name).
@staticmethod
def get_valid_prediction_modes() -> List[str]: ...
6 changes: 6 additions & 0 deletions typings/magika/strenum.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import enum

# Enum base class whose members are also ``str`` instances (inherits from both ``str`` and ``enum.Enum``).
class StrEnum(str, enum.Enum):
# Accepts either a plain string or another StrEnum member as the value.
def __new__(cls, value: str | StrEnum, *args, **kwargs): ...

# StrEnum subclass declaring no additional members or methods in this stub.
# NOTE(review): presumably member values are lower-cased (inferred from the name) — confirm in the implementation.
class LowerCaseStrEnum(StrEnum): ...
49 changes: 49 additions & 0 deletions typings/magika/types.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from dataclasses import dataclass
from typing import List, Optional

# Dataclass holding three lists of ints.
# NOTE(review): presumably features taken from the beginning, middle and end of the content (inferred from the names).
@dataclass
class ModelFeatures:
beg: List[int]
mid: List[int]
end: List[int]
def __init__(self, beg, mid, end) -> None: ...

# Dataclass pairing a content type label (str) with a score (float).
@dataclass
class ModelOutput:
ct_label: str
score: float
def __init__(self, ct_label, score) -> None: ...

# Dataclass returned by the ``Magika.identify_*`` methods.
@dataclass
class MagikaResult:
path: str
# NOTE(review): ``dl`` presumably holds the raw deep-learning model prediction and ``output`` the
# final reported result (inferred from the names and field types) — confirm in the implementation.
dl: ModelOutputFields
output: MagikaOutputFields
def __init__(self, path, dl, output) -> None: ...

# Dataclass describing a prediction; every field is Optional, so any of them may be None.
@dataclass
class ModelOutputFields:
ct_label: Optional[str]
score: Optional[float]
group: Optional[str]
mime_type: Optional[str]
magic: Optional[str]
description: Optional[str]
def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ...

# Dataclass with the same field names as ModelOutputFields, but no field is Optional here.
@dataclass
class MagikaOutputFields:
ct_label: str
score: float
group: str
mime_type: str
magic: str
description: str
def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ...

# Dataclass bundling a hash string with the model features and the Magika result.
@dataclass
class FeedbackReport:
hash: str
features: ModelFeatures
result: MagikaResult
def __init__(self, hash, features, result) -> None: ...

0 comments on commit 6d89e57

Please sign in to comment.