Skip to content

Commit

Permalink
refactor: use Magika v2 model
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Cherng <[email protected]>
  • Loading branch information
jfcherng committed Nov 4, 2024
1 parent 1b0fd96 commit da682ef
Show file tree
Hide file tree
Showing 20 changed files with 585 additions and 185 deletions.
71 changes: 66 additions & 5 deletions AutoSetSyntax.sublime-settings
Original file line number Diff line number Diff line change
Expand Up @@ -694,9 +694,10 @@
// To use this feature, you have to install the "magika" library.
// @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/experimental/dl-based-syntax-detection/#prerequisites
"magika.enabled": false,
"magika.min_confidence": 0.85,
"magika.min_confidence": 0.5,
// To list supported file types, run shell command: `$ magika --list-output-content-types`
// @see https://github.com/google/magika/blob/main/docs/supported_content_types_list.md
"magika.syntax_map.ada": ["scope:source.ada"],
"magika.syntax_map.appleplist": ["scope:text.xml.plist", "=xml"],
"magika.syntax_map.asm": [
// no good way to do this?
Expand All @@ -706,42 +707,102 @@
"scope:source.assembly"
],
"magika.syntax_map.asp": ["scope:source.asp"],
"magika.syntax_map.autohotkey": ["scope:source.ahk"],
"magika.syntax_map.autoit": ["scope:source.autoit"],
"magika.syntax_map.awk": ["scope:source.awk"],
"magika.syntax_map.batch": ["scope:source.dosbatch"],
"magika.syntax_map.c": ["scope:source.c++" /* magika can't distinguish between C and C++ */],
"magika.syntax_map.bazel": ["scope:source.bazel", "=python"],
"magika.syntax_map.brainfuck": ["scope:source.bf"],
"magika.syntax_map.c": ["scope:source.c"],
"magika.syntax_map.clojure": ["scope:source.clojure"],
"magika.syntax_map.cmake": ["scope:source.cmake"],
"magika.syntax_map.cobol": ["scope:source.cobol"],
"magika.syntax_map.coffeescript": ["scope:source.coffee"],
"magika.syntax_map.cpp": ["scope:source.c++"],
"magika.syntax_map.cs": ["scope:source.cs"],
"magika.syntax_map.css": ["scope:source.scss", "scope:source.css"],
"magika.syntax_map.css": ["scope:source.css"],
"magika.syntax_map.csv": ["scope:text.advanced_csv", "scope:text.csv"],
"magika.syntax_map.dart": ["scope:source.dart"],
"magika.syntax_map.diff": ["scope:source.diff"],
"magika.syntax_map.dm": ["scope:source.dm"],
"magika.syntax_map.dockerfile": ["scope:source.containerfile", "scope:source.dockerfile"],
"magika.syntax_map.elixir": ["scope:source.elixir"],
"magika.syntax_map.eml": ["scope:source.eml"],
"magika.syntax_map.erb": ["scope:source.ruby.rails"],
"magika.syntax_map.erlang": ["scope:source.erlang"],
"magika.syntax_map.fortran": ["scope:source.fortran", "scope:source.modern-fortran"],
"magika.syntax_map.gemfile": ["=ruby"],
"magika.syntax_map.gitattributes": ["scope:text.git.attributes"],
"magika.syntax_map.gleam": ["scope:source.gleam"],
"magika.syntax_map.go": ["scope:source.go"],
"magika.syntax_map.gradle": ["scope:source.gradle"],
"magika.syntax_map.groovy": ["scope:source.groovy"],
"magika.syntax_map.h": ["=c"],
"magika.syntax_map.handlebars": ["scope:text.html.handlebars"],
"magika.syntax_map.haskell": ["scope:source.haskell"],
"magika.syntax_map.hpp": ["=cpp"],
"magika.syntax_map.htaccess": ["source.apacheconf"],
"magika.syntax_map.html": ["scope:text.html.basic"],
"magika.syntax_map.ini": ["scope:source.ini"],
"magika.syntax_map.ipynb": ["=json"],
"magika.syntax_map.java": ["scope:source.java"],
"magika.syntax_map.javascript": ["scope:source.ts" /* magika can't distinguish between TypeScript and JavaScript */],
"magika.syntax_map.javascript": ["scope:source.js"],
"magika.syntax_map.jinja": ["scope:text.jinja", "scope:text.html.jinja"],
"magika.syntax_map.json": ["scope:source.json"],
"magika.syntax_map.jsonc": ["scope:source.json"],
"magika.syntax_map.jsx": ["scope:source.jsx"],
"magika.syntax_map.julia": ["scope:source.julia"],
"magika.syntax_map.kotlin": ["scope:source.kotlin"],
"magika.syntax_map.latex": ["scope:text.tex.latex"],
"magika.syntax_map.license": ["=markdown"],
"magika.syntax_map.lisp": ["scope:source.lisp"],
"magika.syntax_map.lua": ["scope:source.lua"],
"magika.syntax_map.m3u": ["scope:text.m3u"],
"magika.syntax_map.makefile": ["scope:source.makefile"],
"magika.syntax_map.markdown": ["scope:text.html.markdown"],
"magika.syntax_map.matlab": ["scope:source.matlab"],
"magika.syntax_map.mum": ["=xml"],
"magika.syntax_map.ocaml": ["scope:source.ocaml"],
"magika.syntax_map.odin": ["scope:source.odin"],
"magika.syntax_map.pascal": ["scope:source.pascal"],
"magika.syntax_map.pem": ["scope:text.pem"],
"magika.syntax_map.perl": ["scope:source.perl"],
"magika.syntax_map.php": ["scope:embedding.php", "scope:text.html.php"],
"magika.syntax_map.po": ["scope:source.po"],
"magika.syntax_map.postscript": ["scope:source.postscript"],
"magika.syntax_map.powershell": ["scope:source.powershell"],
"magika.syntax_map.prolog": ["scope:source.prolog"],
"magika.syntax_map.proto": ["scope:source.proto"],
"magika.syntax_map.protobuf": ["scope:text.prototxt"],
"magika.syntax_map.python": ["scope:source.python"],
"magika.syntax_map.r": ["scope:source.r"],
"magika.syntax_map.rdf": ["=xml"],
"magika.syntax_map.rst": ["scope:text.restructuredtext"],
"magika.syntax_map.rtf": ["scope:text.rtf"],
"magika.syntax_map.ruby": ["scope:source.ruby"],
"magika.syntax_map.rust": ["scope:source.rust"],
"magika.syntax_map.scala": ["scope:source.scala"],
"magika.syntax_map.scss": ["scope:source.scss"],
"magika.syntax_map.shell": ["scope:source.shell.bash"],
"magika.syntax_map.smali": ["scope:source.smali"],
"magika.syntax_map.solidity": ["scope:source.solidity"],
"magika.syntax_map.sql": ["scope:source.sql"],
"magika.syntax_map.srt": ["scope:text.srt"],
"magika.syntax_map.svg": ["=xml"],
"magika.syntax_map.swift": ["scope:source.swift"],
"magika.syntax_map.tcl": ["scope:source.tcl"],
"magika.syntax_map.toml": ["scope:source.toml"],
"magika.syntax_map.tsx": ["scope:source.tsx"],
"magika.syntax_map.twig": ["scope:text.html.twig", "=jinja"],
"magika.syntax_map.txt": ["scope:text.plain"],
"magika.syntax_map.typescript": ["scope:source.ts"],
"magika.syntax_map.vba": ["scope:source.vbs"],
"magika.syntax_map.verilog": ["scope:source.verilog"],
"magika.syntax_map.vhdl": ["scope:source.vhdl"],
"magika.syntax_map.vtt": ["scope:text.vtt"],
"magika.syntax_map.vue": ["scope:text.html.vue"],
"magika.syntax_map.winregistry": ["scope:source.reg"],
"magika.syntax_map.xml": ["scope:text.xml"],
"magika.syntax_map.yaml": ["scope:source.yaml"]
"magika.syntax_map.yaml": ["scope:source.yaml"],
"magika.syntax_map.zig": ["scope:source.zig"]
}
25 changes: 18 additions & 7 deletions plugin/commands/auto_set_syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,23 +267,34 @@ def _assign_syntax_with_magika(view_snapshot: ViewSnapshot, event: ListenerEvent
return False

try:
from magika import Magika, PredictionMode
from magika import ContentTypeLabel, Magika, PredictionMode
from magika import magika as magika_magika
except ImportError as e:
Logger.log(f"💣 Error occured while importing Magika: {e}", window=window)
return False

magika_magika.DEFAULT_MODEL_NAME = "fast_v2_1" # by default, it's "stable_v2_1"
magika = Magika(prediction_mode=PredictionMode.BEST_GUESS) # we have "magika.min_confidence" as the threshold
if view_snapshot.path_obj and not view.is_dirty():
status_result = magika.identify_path(view_snapshot.path_obj)
magika_result = magika.identify_path(view_snapshot.path_obj)
else:
status_result = magika.identify_bytes(view_snapshot.content_bytes)
# Logger.log(f"🐛 Magika's prediction: {status_result.output!r}", window=window)
magika_result = magika.identify_bytes(view_snapshot.content_bytes)
if not magika_result.ok:
Logger.log(f"😢 Magika failed: {magika_result.status}", window=window)
return False
Logger.log(f"🐛 Magika's prediction: {magika_result!r}", window=window)

magika_label = status_result.output.ct_label
magika_score = status_result.output.score # range: 0.0 ~ 1.0
magika_label = magika_result.output.label
magika_score = magika_result.score # range: 0.0 ~ 1.0

threadshold: float = settings.get("magika.min_confidence", 0.0)
if magika_score < threadshold or magika_label in {"directory", "empty", "txt", "unknown"}:
if magika_score < threadshold or magika_label in {
ContentTypeLabel.DIRECTORY,
ContentTypeLabel.EMPTY,
ContentTypeLabel.TXT,
ContentTypeLabel.UNDEFINED,
ContentTypeLabel.UNKNOWN,
}:
return False

syntax_map: dict[str, list[str]] = extract_prefixed_dict(settings, prefix="magika.syntax_map.")
Expand Down
13 changes: 8 additions & 5 deletions typings/magika/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from magika import magika as magika, prediction_mode as prediction_mode

Magika = magika.Magika
MagikaError = magika.MagikaError
PredictionMode = prediction_mode.PredictionMode
from magika import magika as magika
from magika.types import content_type_label as content_type_label, magika_error as magika_error, prediction_mode as prediction_mode

__version__: str
Magika = magika.Magika
MagikaError = magika_error.MagikaError
ContentTypeLabel = content_type_label.ContentTypeLabel
PredictionMode = prediction_mode.PredictionMode
19 changes: 0 additions & 19 deletions typings/magika/cli/magika.pyi

This file was deleted.

72 changes: 0 additions & 72 deletions typings/magika/content_types.pyi

This file was deleted.

2 changes: 1 addition & 1 deletion typings/magika/logger.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class SimpleLogger:
def __init__(self, use_colors: bool = False) -> None: ...
def setLevel(self, level: int) -> None: ...
def raw_print_to_stdout(self, msg: str) -> None: ...
def raw_print(self, msg: str, file: TextIO = ...) -> None: ...
def raw_print(self, msg: str, file: TextIO | None = None, flush: bool = True) -> None: ...
def debug(self, msg: str) -> None: ...
def info(self, msg: str) -> None: ...
def warning(self, msg: str) -> None: ...
Expand Down
19 changes: 8 additions & 11 deletions typings/magika/magika.pyi
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
from magika.content_types import ContentType as ContentType, ContentTypesManager as ContentTypesManager
from magika.logger import get_logger as get_logger
from magika.prediction_mode import PredictionMode as PredictionMode
from magika.types import MagikaOutputFields as MagikaOutputFields, MagikaResult as MagikaResult, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, ModelOutputFields as ModelOutputFields
from magika.seekable import Buffer as Buffer, File as File, Seekable as Seekable
from magika.types import ContentTypeInfo as ContentTypeInfo, ContentTypeLabel as ContentTypeLabel, MagikaError as MagikaError, MagikaPrediction as MagikaPrediction, MagikaResult as MagikaResult, ModelConfig as ModelConfig, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, PredictionMode as PredictionMode, Status as Status
from pathlib import Path
from typing import List, Optional

DEFAULT_MODEL_NAME: str

class Magika:
def __init__(self, model_dir: Optional[Path] = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ...
def __init__(self, model_dir: Path | None = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ...
def identify_path(self, path: Path) -> MagikaResult: ...
def identify_paths(self, paths: List[Path]) -> List[MagikaResult]: ...
def identify_paths(self, paths: list[Path]) -> list[MagikaResult]: ...
def identify_bytes(self, content: bytes) -> MagikaResult: ...
@staticmethod
def get_default_model_name() -> str: ...
def get_model_name(self) -> str: ...

class MagikaError(Exception): ...
def get_supported_content_types(self) -> list[ContentTypeLabel]: ...
def get_model_dir_name(self) -> str: ...
10 changes: 0 additions & 10 deletions typings/magika/prediction_mode.pyi

This file was deleted.

19 changes: 19 additions & 0 deletions typings/magika/seekable.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import abc
from pathlib import Path

class Seekable(abc.ABC, metaclass=abc.ABCMeta):
    """Abstract base for the seekable inputs declared in this stub.

    Subclasses must implement ``read_at``; it is the only abstract member.
    """

    def __init__(self) -> None: ...

    # Read-only property typed as ``int``.
    # NOTE(review): presumably the total byte length of the source -- confirm
    # against the magika implementation.
    @property
    def size(self) -> int: ...

    # Abstract: concrete subclasses have to override this.
    # NOTE(review): presumably returns up to ``size`` bytes starting at
    # ``offset`` -- confirm against the magika implementation.
    @abc.abstractmethod
    def read_at(self, offset: int, size: int) -> bytes: ...

    def close(self) -> None: ...

class File(Seekable):
    """``Seekable`` constructed from a filesystem ``Path``."""

    def __init__(self, path: Path) -> None: ...

    # Concrete override of the abstract ``Seekable.read_at``.
    def read_at(self, offset: int, size: int) -> bytes: ...

    # Declared explicitly on this subclass (``Buffer`` does not redeclare it).
    # NOTE(review): presumably releases the underlying file handle -- confirm.
    def close(self) -> None: ...

class Buffer(Seekable):
    """``Seekable`` constructed from an in-memory ``bytes`` object."""

    def __init__(self, buffer: bytes) -> None: ...

    # Concrete override of the abstract ``Seekable.read_at``.
    def read_at(self, offset: int, size: int) -> bytes: ...
49 changes: 0 additions & 49 deletions typings/magika/types.pyi

This file was deleted.

10 changes: 10 additions & 0 deletions typings/magika/types/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from magika.types.content_type_info import ContentTypeInfo as ContentTypeInfo
from magika.types.content_type_label import ContentTypeLabel as ContentTypeLabel
from magika.types.magika_error import MagikaError as MagikaError
from magika.types.magika_prediction import MagikaPrediction as MagikaPrediction
from magika.types.magika_result import MagikaResult as MagikaResult
from magika.types.model import ModelConfig as ModelConfig, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput
from magika.types.prediction_mode import PredictionMode as PredictionMode
from magika.types.status import Status as Status

__all__ = ['ContentTypeInfo', 'ContentTypeLabel', 'MagikaError', 'MagikaPrediction', 'MagikaResult', 'ModelConfig', 'ModelFeatures', 'ModelOutput', 'PredictionMode', 'Status']
Loading

0 comments on commit da682ef

Please sign in to comment.