From da682ef3997a290dac07ffdb9b95b0a700765d73 Mon Sep 17 00:00:00 2001 From: Jack Cherng Date: Mon, 12 Aug 2024 15:49:51 +0800 Subject: [PATCH] refactor: use Magika v2 model Signed-off-by: Jack Cherng --- AutoSetSyntax.sublime-settings | 71 +++- plugin/commands/auto_set_syntax.py | 25 +- typings/magika/__init__.pyi | 13 +- typings/magika/cli/magika.pyi | 19 -- typings/magika/content_types.pyi | 72 ---- typings/magika/logger.pyi | 2 +- typings/magika/magika.pyi | 19 +- typings/magika/prediction_mode.pyi | 10 - typings/magika/seekable.pyi | 19 ++ typings/magika/types.pyi | 49 --- typings/magika/types/__init__.pyi | 10 + typings/magika/types/content_type_info.pyi | 19 ++ typings/magika/types/content_type_label.pyi | 349 ++++++++++++++++++++ typings/magika/types/magika_error.pyi | 1 + typings/magika/types/magika_prediction.pyi | 9 + typings/magika/types/magika_result.pyi | 22 ++ typings/magika/types/model.pyi | 34 ++ typings/magika/types/prediction_mode.pyi | 8 + typings/magika/types/status.pyi | 7 + typings/magika/{ => types}/strenum.pyi | 12 +- 20 files changed, 585 insertions(+), 185 deletions(-) delete mode 100644 typings/magika/cli/magika.pyi delete mode 100644 typings/magika/content_types.pyi delete mode 100644 typings/magika/prediction_mode.pyi create mode 100644 typings/magika/seekable.pyi delete mode 100644 typings/magika/types.pyi create mode 100644 typings/magika/types/__init__.pyi create mode 100644 typings/magika/types/content_type_info.pyi create mode 100644 typings/magika/types/content_type_label.pyi create mode 100644 typings/magika/types/magika_error.pyi create mode 100644 typings/magika/types/magika_prediction.pyi create mode 100644 typings/magika/types/magika_result.pyi create mode 100644 typings/magika/types/model.pyi create mode 100644 typings/magika/types/prediction_mode.pyi create mode 100644 typings/magika/types/status.pyi rename typings/magika/{ => types}/strenum.pyi (96%) diff --git a/AutoSetSyntax.sublime-settings 
b/AutoSetSyntax.sublime-settings index 2ff58252..684b5df7 100644 --- a/AutoSetSyntax.sublime-settings +++ b/AutoSetSyntax.sublime-settings @@ -694,9 +694,10 @@ // To use this feature, you have to install the "magika" library. // @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/experimental/dl-based-syntax-detection/#prerequisites "magika.enabled": false, - "magika.min_confidence": 0.85, + "magika.min_confidence": 0.5, // To list supported file types, run shell command: `$ magika --list-output-content-types` // @see https://github.com/google/magika/blob/main/docs/supported_content_types_list.md + "magika.syntax_map.ada": ["scope:source.ada"], "magika.syntax_map.appleplist": ["scope:text.xml.plist", "=xml"], "magika.syntax_map.asm": [ // no good way to do this? @@ -706,42 +707,102 @@ "scope:source.assembly" ], "magika.syntax_map.asp": ["scope:source.asp"], + "magika.syntax_map.autohotkey": ["scope:source.ahk"], + "magika.syntax_map.autoit": ["scope:source.autoit"], + "magika.syntax_map.awk": ["scope:source.awk"], "magika.syntax_map.batch": ["scope:source.dosbatch"], - "magika.syntax_map.c": ["scope:source.c++" /* magika can't distinguish between C and C++ */], + "magika.syntax_map.bazel": ["scope:source.bazel", "=python"], + "magika.syntax_map.brainfuck": ["scope:source.bf"], + "magika.syntax_map.c": ["scope:source.c"], + "magika.syntax_map.clojure": ["scope:source.clojure"], + "magika.syntax_map.cmake": ["scope:source.cmake"], + "magika.syntax_map.cobol": ["scope:source.cobol"], + "magika.syntax_map.coffeescript": ["scope:source.coffee"], + "magika.syntax_map.cpp": ["scope:source.c++"], "magika.syntax_map.cs": ["scope:source.cs"], - "magika.syntax_map.css": ["scope:source.scss", "scope:source.css"], + "magika.syntax_map.css": ["scope:source.css"], "magika.syntax_map.csv": ["scope:text.advanced_csv", "scope:text.csv"], + "magika.syntax_map.dart": ["scope:source.dart"], + "magika.syntax_map.diff": ["scope:source.diff"], + "magika.syntax_map.dm": 
["scope:source.dm"], + "magika.syntax_map.dockerfile": ["scope:source.containerfile", "scope:source.dockerfile"], + "magika.syntax_map.elixir": ["scope:source.elixir"], + "magika.syntax_map.eml": ["scope:source.eml"], + "magika.syntax_map.erb": ["scope:source.ruby.rails"], + "magika.syntax_map.erlang": ["scope:source.erlang"], + "magika.syntax_map.fortran": ["scope:source.fortran", "scope:source.modern-fortran"], + "magika.syntax_map.gemfile": ["=ruby"], + "magika.syntax_map.gitattributes": ["scope:text.git.attributes"], + "magika.syntax_map.gleam": ["scope:source.gleam"], "magika.syntax_map.go": ["scope:source.go"], + "magika.syntax_map.gradle": ["scope:source.gradle"], + "magika.syntax_map.groovy": ["scope:source.groovy"], + "magika.syntax_map.h": ["=c"], + "magika.syntax_map.handlebars": ["scope:text.html.handlebars"], + "magika.syntax_map.haskell": ["scope:source.haskell"], + "magika.syntax_map.hpp": ["=cpp"], + "magika.syntax_map.htaccess": ["scope:source.apacheconf"], "magika.syntax_map.html": ["scope:text.html.basic"], "magika.syntax_map.ini": ["scope:source.ini"], + "magika.syntax_map.ipynb": ["=json"], "magika.syntax_map.java": ["scope:source.java"], - "magika.syntax_map.javascript": ["scope:source.ts" /* magika can't distinguish between TypeScript and JavaScript */], + "magika.syntax_map.javascript": ["scope:source.js"], + "magika.syntax_map.jinja": ["scope:text.jinja", "scope:text.html.jinja"], "magika.syntax_map.json": ["scope:source.json"], + "magika.syntax_map.jsonc": ["scope:source.json"], + "magika.syntax_map.jsx": ["scope:source.jsx"], + "magika.syntax_map.julia": ["scope:source.julia"], + "magika.syntax_map.kotlin": ["scope:source.kotlin"], "magika.syntax_map.latex": ["scope:text.tex.latex"], + "magika.syntax_map.license": ["=markdown"], "magika.syntax_map.lisp": ["scope:source.lisp"], + "magika.syntax_map.lua": ["scope:source.lua"], "magika.syntax_map.m3u": ["scope:text.m3u"], "magika.syntax_map.makefile": ["scope:source.makefile"],
"magika.syntax_map.markdown": ["scope:text.html.markdown"], + "magika.syntax_map.matlab": ["scope:source.matlab"], "magika.syntax_map.mum": ["=xml"], + "magika.syntax_map.ocaml": ["scope:source.ocaml"], + "magika.syntax_map.odin": ["scope:source.odin"], + "magika.syntax_map.pascal": ["scope:source.pascal"], "magika.syntax_map.pem": ["scope:text.pem"], "magika.syntax_map.perl": ["scope:source.perl"], "magika.syntax_map.php": ["scope:embedding.php", "scope:text.html.php"], + "magika.syntax_map.po": ["scope:source.po"], "magika.syntax_map.postscript": ["scope:source.postscript"], "magika.syntax_map.powershell": ["scope:source.powershell"], + "magika.syntax_map.prolog": ["scope:source.prolog"], + "magika.syntax_map.proto": ["scope:source.proto"], + "magika.syntax_map.protobuf": ["scope:text.prototxt"], "magika.syntax_map.python": ["scope:source.python"], + "magika.syntax_map.r": ["scope:source.r"], "magika.syntax_map.rdf": ["=xml"], "magika.syntax_map.rst": ["scope:text.restructuredtext"], "magika.syntax_map.rtf": ["scope:text.rtf"], "magika.syntax_map.ruby": ["scope:source.ruby"], "magika.syntax_map.rust": ["scope:source.rust"], "magika.syntax_map.scala": ["scope:source.scala"], + "magika.syntax_map.scss": ["scope:source.scss"], "magika.syntax_map.shell": ["scope:source.shell.bash"], "magika.syntax_map.smali": ["scope:source.smali"], + "magika.syntax_map.solidity": ["scope:source.solidity"], "magika.syntax_map.sql": ["scope:source.sql"], + "magika.syntax_map.srt": ["scope:text.srt"], "magika.syntax_map.svg": ["=xml"], + "magika.syntax_map.swift": ["scope:source.swift"], + "magika.syntax_map.tcl": ["scope:source.tcl"], + "magika.syntax_map.toml": ["scope:source.toml"], + "magika.syntax_map.tsx": ["scope:source.tsx"], + "magika.syntax_map.twig": ["scope:text.html.twig", "=jinja"], "magika.syntax_map.txt": ["scope:text.plain"], + "magika.syntax_map.typescript": ["scope:source.ts"], "magika.syntax_map.vba": ["scope:source.vbs"], + "magika.syntax_map.verilog": 
["scope:source.verilog"], + "magika.syntax_map.vhdl": ["scope:source.vhdl"], + "magika.syntax_map.vtt": ["scope:text.vtt"], + "magika.syntax_map.vue": ["scope:text.html.vue"], "magika.syntax_map.winregistry": ["scope:source.reg"], "magika.syntax_map.xml": ["scope:text.xml"], - "magika.syntax_map.yaml": ["scope:source.yaml"] + "magika.syntax_map.yaml": ["scope:source.yaml"], + "magika.syntax_map.zig": ["scope:source.zig"] } diff --git a/plugin/commands/auto_set_syntax.py b/plugin/commands/auto_set_syntax.py index 1116f8a0..0435792d 100644 --- a/plugin/commands/auto_set_syntax.py +++ b/plugin/commands/auto_set_syntax.py @@ -267,23 +267,34 @@ def _assign_syntax_with_magika(view_snapshot: ViewSnapshot, event: ListenerEvent return False try: - from magika import Magika, PredictionMode + from magika import ContentTypeLabel, Magika, PredictionMode + from magika import magika as magika_magika except ImportError as e: Logger.log(f"💣 Error occured while importing Magika: {e}", window=window) return False + magika_magika.DEFAULT_MODEL_NAME = "fast_v2_1" # by default, it's "stable_v2_1" magika = Magika(prediction_mode=PredictionMode.BEST_GUESS) # we have "magika.min_confidence" as the threshold if view_snapshot.path_obj and not view.is_dirty(): - status_result = magika.identify_path(view_snapshot.path_obj) + magika_result = magika.identify_path(view_snapshot.path_obj) else: - status_result = magika.identify_bytes(view_snapshot.content_bytes) - # Logger.log(f"🐛 Magika's prediction: {status_result.output!r}", window=window) + magika_result = magika.identify_bytes(view_snapshot.content_bytes) + if not magika_result.ok: + Logger.log(f"😢 Magika failed: {magika_result.status}", window=window) + return False + Logger.log(f"🐛 Magika's prediction: {magika_result!r}", window=window) - magika_label = status_result.output.ct_label - magika_score = status_result.output.score # range: 0.0 ~ 1.0 + magika_label = magika_result.output.label + magika_score = magika_result.score # range: 0.0 ~ 
1.0 threadshold: float = settings.get("magika.min_confidence", 0.0) - if magika_score < threadshold or magika_label in {"directory", "empty", "txt", "unknown"}: + if magika_score < threadshold or magika_label in { + ContentTypeLabel.DIRECTORY, + ContentTypeLabel.EMPTY, + ContentTypeLabel.TXT, + ContentTypeLabel.UNDEFINED, + ContentTypeLabel.UNKNOWN, + }: return False syntax_map: dict[str, list[str]] = extract_prefixed_dict(settings, prefix="magika.syntax_map.") diff --git a/typings/magika/__init__.pyi b/typings/magika/__init__.pyi index 3c4eef3d..94cab2b0 100644 --- a/typings/magika/__init__.pyi +++ b/typings/magika/__init__.pyi @@ -1,5 +1,8 @@ -from magika import magika as magika, prediction_mode as prediction_mode - -Magika = magika.Magika -MagikaError = magika.MagikaError -PredictionMode = prediction_mode.PredictionMode +from magika import magika as magika +from magika.types import content_type_label as content_type_label, magika_error as magika_error, prediction_mode as prediction_mode + +__version__: str +Magika = magika.Magika +MagikaError = magika_error.MagikaError +ContentTypeLabel = content_type_label.ContentTypeLabel +PredictionMode = prediction_mode.PredictionMode diff --git a/typings/magika/cli/magika.pyi b/typings/magika/cli/magika.pyi deleted file mode 100644 index 55e48598..00000000 --- a/typings/magika/cli/magika.pyi +++ /dev/null @@ -1,19 +0,0 @@ -from _typeshed import Incomplete -from magika import Magika as Magika, MagikaError as MagikaError, PredictionMode as PredictionMode, colors as colors -from magika.content_types import ContentTypesManager as ContentTypesManager -from magika.logger import get_logger as get_logger -from magika.types import FeedbackReport as FeedbackReport, MagikaResult as MagikaResult -from pathlib import Path -from typing import List, Optional - -VERSION: str -CONTACT_EMAIL: str -CONTEXT_SETTINGS: Incomplete -HELP_EPILOG: Incomplete - -def main(file: List[Path], recursive: bool, json_output: bool, jsonl_output: bool, 
mime_output: bool, label_output: bool, magic_compatibility_mode: bool, output_score: bool, prediction_mode_str: str, batch_size: int, no_dereference: bool, with_colors: bool, verbose: bool, debug: bool, generate_report_flag: bool, output_version: bool, list_output_content_types: bool, model_dir: Optional[Path]) -> None: ... -def should_read_from_stdin(files_paths: List[Path]) -> bool: ... -def get_magika_result_from_stdin(magika: Magika) -> MagikaResult: ... -def generate_feedback_report(magika: Magika, file_path: Path, magika_result: MagikaResult) -> FeedbackReport: ... -def print_feedback_report(magika: Magika, reports: List[FeedbackReport]) -> None: ... -def print_output_content_types_list() -> None: ... diff --git a/typings/magika/content_types.pyi b/typings/magika/content_types.pyi deleted file mode 100644 index 5f376e95..00000000 --- a/typings/magika/content_types.pyi +++ /dev/null @@ -1,72 +0,0 @@ -from _typeshed import Incomplete -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional - -CONTENT_TYPES_CONFIG_PATH: Incomplete - -class ContentType: - UNKNOWN: str - UNKNOWN_MIME_TYPE: str - UNKNOWN_CONTENT_TYPE_GROUP: str - UNKNOWN_MAGIC: str - UNKNOWN_DESCRIPTION: str - UNSUPPORTED: str - ERROR: str - MISSING: str - EMPTY: str - CORRUPTED: str - TIMEOUT: str - NOT_VALID: str - FILE_DOES_NOT_EXIST: str - PERMISSION_ERROR: str - DIRECTORY: str - SYMLINK: str - GENERIC_TEXT: str - name: Incomplete - extensions: Incomplete - mime_type: Incomplete - group: Incomplete - magic: Incomplete - description: Incomplete - vt_type: Incomplete - datasets: Incomplete - parent: Incomplete - tags: Incomplete - model_target_label: Incomplete - target_label: Incomplete - correct_labels: Incomplete - in_scope_for_output_content_type: Incomplete - def __init__(self, name: str, extensions: List[str], mime_type: Optional[str], group: Optional[str], magic: Optional[str], description: Optional[str], vt_type: Optional[str], datasets: List[str], parent: 
Optional[str], tags: List[str], model_target_label: Optional[str], target_label: Optional[str], correct_labels: List[str], in_scope_for_output_content_type: bool, add_automatic_tags: bool = True) -> None: ... - @property - def is_text(self) -> bool: ... - @property - def in_scope_for_training(self) -> bool: ... - def to_dict(self) -> Dict[str, Any]: ... - @staticmethod - def from_dict(info_d: Dict, add_automatic_tags: bool = True) -> ContentType: ... - -class ContentTypesManager: - SPECIAL_CONTENT_TYPES: List[str] - SUPPORTED_TARGET_LABELS_SPEC: Incomplete - cts: Incomplete - tag2cts: Incomplete - ext2cts: Incomplete - def __init__(self, content_type_config_path: Path = ..., add_automatic_tags: bool = True) -> None: ... - def load_content_types_info(self, content_type_config_path: Path, add_automatic_tags: bool = True) -> None: ... - def get(self, content_type_name: str) -> Optional[ContentType]: ... - def get_or_raise(self, content_type_name: Optional[str]) -> ContentType: ... - def get_mime_type(self, content_type_name: str, default: str = ...) -> str: ... - def get_group(self, content_type_name: str, default: str = ...) -> str: ... - def get_magic(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ... - def get_description(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ... - def get_cts_by_ext(self, ext: str) -> List[ContentType]: ... - def get_cts_by_ext_or_raise(self, ext: str) -> List[ContentType]: ... - def get_valid_tags(self, only_explicit: bool = True) -> List[str]: ... - def is_valid_ct_label(self, label: str) -> bool: ... - def is_valid_tag(self, tag: str) -> bool: ... - def select(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[ContentType]: ... - def select_names(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[str]: ... - def get_content_types_space(self) -> List[str]: ... 
- def get_output_content_types(self) -> List[ContentType]: ... - def get_output_content_types_names(self) -> List[str]: ... - def get_invalid_labels(self, labels: Iterable[str]) -> List[str]: ... diff --git a/typings/magika/logger.pyi b/typings/magika/logger.pyi index f1f3ce17..5929e4c8 100644 --- a/typings/magika/logger.pyi +++ b/typings/magika/logger.pyi @@ -8,7 +8,7 @@ class SimpleLogger: def __init__(self, use_colors: bool = False) -> None: ... def setLevel(self, level: int) -> None: ... def raw_print_to_stdout(self, msg: str) -> None: ... - def raw_print(self, msg: str, file: TextIO = ...) -> None: ... + def raw_print(self, msg: str, file: TextIO | None = None, flush: bool = True) -> None: ... def debug(self, msg: str) -> None: ... def info(self, msg: str) -> None: ... def warning(self, msg: str) -> None: ... diff --git a/typings/magika/magika.pyi b/typings/magika/magika.pyi index c9873c78..be0edcc5 100644 --- a/typings/magika/magika.pyi +++ b/typings/magika/magika.pyi @@ -1,17 +1,14 @@ -from magika.content_types import ContentType as ContentType, ContentTypesManager as ContentTypesManager from magika.logger import get_logger as get_logger -from magika.prediction_mode import PredictionMode as PredictionMode -from magika.types import MagikaOutputFields as MagikaOutputFields, MagikaResult as MagikaResult, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, ModelOutputFields as ModelOutputFields +from magika.seekable import Buffer as Buffer, File as File, Seekable as Seekable +from magika.types import ContentTypeInfo as ContentTypeInfo, ContentTypeLabel as ContentTypeLabel, MagikaError as MagikaError, MagikaPrediction as MagikaPrediction, MagikaResult as MagikaResult, ModelConfig as ModelConfig, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, PredictionMode as PredictionMode, Status as Status from pathlib import Path -from typing import List, Optional + +DEFAULT_MODEL_NAME: str class Magika: - def __init__(self, model_dir: Optional[Path] = 
None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ... + def __init__(self, model_dir: Path | None = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ... def identify_path(self, path: Path) -> MagikaResult: ... - def identify_paths(self, paths: List[Path]) -> List[MagikaResult]: ... + def identify_paths(self, paths: list[Path]) -> list[MagikaResult]: ... def identify_bytes(self, content: bytes) -> MagikaResult: ... - @staticmethod - def get_default_model_name() -> str: ... - def get_model_name(self) -> str: ... - -class MagikaError(Exception): ... + def get_supported_content_types(self) -> list[ContentTypeLabel]: ... + def get_model_dir_name(self) -> str: ... diff --git a/typings/magika/prediction_mode.pyi b/typings/magika/prediction_mode.pyi deleted file mode 100644 index f3ec6210..00000000 --- a/typings/magika/prediction_mode.pyi +++ /dev/null @@ -1,10 +0,0 @@ -from _typeshed import Incomplete -from magika.strenum import LowerCaseStrEnum as LowerCaseStrEnum -from typing import List - -class PredictionMode(LowerCaseStrEnum): - BEST_GUESS: Incomplete - MEDIUM_CONFIDENCE: Incomplete - HIGH_CONFIDENCE: Incomplete - @staticmethod - def get_valid_prediction_modes() -> List[str]: ... diff --git a/typings/magika/seekable.pyi b/typings/magika/seekable.pyi new file mode 100644 index 00000000..3ad71f40 --- /dev/null +++ b/typings/magika/seekable.pyi @@ -0,0 +1,19 @@ +import abc +from pathlib import Path + +class Seekable(abc.ABC, metaclass=abc.ABCMeta): + def __init__(self) -> None: ... + @property + def size(self) -> int: ... + @abc.abstractmethod + def read_at(self, offset: int, size: int) -> bytes: ... + def close(self) -> None: ... + +class File(Seekable): + def __init__(self, path: Path) -> None: ... + def read_at(self, offset: int, size: int) -> bytes: ... 
+ def close(self) -> None: ... + +class Buffer(Seekable): + def __init__(self, buffer: bytes) -> None: ... + def read_at(self, offset: int, size: int) -> bytes: ... diff --git a/typings/magika/types.pyi b/typings/magika/types.pyi deleted file mode 100644 index c55fbe89..00000000 --- a/typings/magika/types.pyi +++ /dev/null @@ -1,49 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional - -@dataclass -class ModelFeatures: - beg: List[int] - mid: List[int] - end: List[int] - def __init__(self, beg, mid, end) -> None: ... - -@dataclass -class ModelOutput: - ct_label: str - score: float - def __init__(self, ct_label, score) -> None: ... - -@dataclass -class MagikaResult: - path: str - dl: ModelOutputFields - output: MagikaOutputFields - def __init__(self, path, dl, output) -> None: ... - -@dataclass -class ModelOutputFields: - ct_label: Optional[str] - score: Optional[float] - group: Optional[str] - mime_type: Optional[str] - magic: Optional[str] - description: Optional[str] - def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ... - -@dataclass -class MagikaOutputFields: - ct_label: str - score: float - group: str - mime_type: str - magic: str - description: str - def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ... - -@dataclass -class FeedbackReport: - hash: str - features: ModelFeatures - result: MagikaResult - def __init__(self, hash, features, result) -> None: ... 
diff --git a/typings/magika/types/__init__.pyi b/typings/magika/types/__init__.pyi new file mode 100644 index 00000000..26e2cc2d --- /dev/null +++ b/typings/magika/types/__init__.pyi @@ -0,0 +1,10 @@ +from magika.types.content_type_info import ContentTypeInfo as ContentTypeInfo +from magika.types.content_type_label import ContentTypeLabel as ContentTypeLabel +from magika.types.magika_error import MagikaError as MagikaError +from magika.types.magika_prediction import MagikaPrediction as MagikaPrediction +from magika.types.magika_result import MagikaResult as MagikaResult +from magika.types.model import ModelConfig as ModelConfig, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput +from magika.types.prediction_mode import PredictionMode as PredictionMode +from magika.types.status import Status as Status + +__all__ = ['ContentTypeInfo', 'ContentTypeLabel', 'MagikaError', 'MagikaPrediction', 'MagikaResult', 'ModelConfig', 'ModelFeatures', 'ModelOutput', 'PredictionMode', 'Status'] diff --git a/typings/magika/types/content_type_info.pyi b/typings/magika/types/content_type_info.pyi new file mode 100644 index 00000000..953c6716 --- /dev/null +++ b/typings/magika/types/content_type_info.pyi @@ -0,0 +1,19 @@ +from dataclasses import dataclass +from magika.logger import get_logger as get_logger +from magika.types.content_type_label import ContentTypeLabel as ContentTypeLabel + +@dataclass(frozen=True) +class ContentTypeInfo: + label: ContentTypeLabel + mime_type: str + group: str + description: str + extensions: list[str] + is_text: bool + @property + def ct_label(self) -> str: ... + @property + def score(self) -> float: ... + @property + def magic(self) -> str: ... + def __init__(self, label, mime_type, group, description, extensions, is_text) -> None: ... 
diff --git a/typings/magika/types/content_type_label.pyi b/typings/magika/types/content_type_label.pyi new file mode 100644 index 00000000..b8f13ecf --- /dev/null +++ b/typings/magika/types/content_type_label.pyi @@ -0,0 +1,349 @@ +from magika.types.strenum import StrEnum as StrEnum + +class ContentTypeLabel(StrEnum): + ABNF = 'abnf' + ACE = 'ace' + ADA = 'ada' + AFF = 'aff' + AI = 'ai' + AIDL = 'aidl' + ALGOL68 = 'algol68' + ANI = 'ani' + APK = 'apk' + APPLEBPLIST = 'applebplist' + APPLEDOUBLE = 'appledouble' + APPLEPLIST = 'appleplist' + APPLESINGLE = 'applesingle' + AR = 'ar' + ARC = 'arc' + ARJ = 'arj' + ARROW = 'arrow' + ASC = 'asc' + ASD = 'asd' + ASF = 'asf' + ASM = 'asm' + ASP = 'asp' + AU = 'au' + AUTOHOTKEY = 'autohotkey' + AUTOIT = 'autoit' + AVI = 'avi' + AVIF = 'avif' + AVRO = 'avro' + AWK = 'awk' + AX = 'ax' + BATCH = 'batch' + BAZEL = 'bazel' + BCAD = 'bcad' + BIB = 'bib' + BMP = 'bmp' + BPG = 'bpg' + BPL = 'bpl' + BRAINFUCK = 'brainfuck' + BRF = 'brf' + BZIP = 'bzip' + BZIP3 = 'bzip3' + C = 'c' + CAB = 'cab' + CAD = 'cad' + CAT = 'cat' + CDF = 'cdf' + CHM = 'chm' + CLOJURE = 'clojure' + CMAKE = 'cmake' + COBOL = 'cobol' + COFF = 'coff' + COFFEESCRIPT = 'coffeescript' + COM = 'com' + CPL = 'cpl' + CPP = 'cpp' + CRT = 'crt' + CRX = 'crx' + CS = 'cs' + CSPROJ = 'csproj' + CSS = 'css' + CSV = 'csv' + CTL = 'ctl' + DART = 'dart' + DEB = 'deb' + DEX = 'dex' + DEY = 'dey' + DICOM = 'dicom' + DIFF = 'diff' + DIRECTORY = 'directory' + DJANGO = 'django' + DLL = 'dll' + DM = 'dm' + DMG = 'dmg' + DMIGD = 'dmigd' + DMSCRIPT = 'dmscript' + DOC = 'doc' + DOCKERFILE = 'dockerfile' + DOCX = 'docx' + DOSMBR = 'dosmbr' + DOTX = 'dotx' + DSSTORE = 'dsstore' + DWG = 'dwg' + DXF = 'dxf' + DYLIB = 'dylib' + EBML = 'ebml' + ELF = 'elf' + ELIXIR = 'elixir' + EMF = 'emf' + EML = 'eml' + EMPTY = 'empty' + EPUB = 'epub' + ERB = 'erb' + ERLANG = 'erlang' + ESE = 'ese' + EXE = 'exe' + EXP = 'exp' + FLAC = 'flac' + FLUTTER = 'flutter' + FLV = 'flv' + FORTRAN = 'fortran' + FPX = 
'fpx' + GEMFILE = 'gemfile' + GEMSPEC = 'gemspec' + GIF = 'gif' + GITATTRIBUTES = 'gitattributes' + GITMODULES = 'gitmodules' + GLEAM = 'gleam' + GO = 'go' + GPX = 'gpx' + GRADLE = 'gradle' + GROOVY = 'groovy' + GZIP = 'gzip' + H = 'h' + H5 = 'h5' + HANDLEBARS = 'handlebars' + HASKELL = 'haskell' + HCL = 'hcl' + HEIF = 'heif' + HFS = 'hfs' + HLP = 'hlp' + HPP = 'hpp' + HTA = 'hta' + HTACCESS = 'htaccess' + HTML = 'html' + HVE = 'hve' + HWP = 'hwp' + ICC = 'icc' + ICNS = 'icns' + ICO = 'ico' + ICS = 'ics' + IGNOREFILE = 'ignorefile' + IMG = 'img' + INI = 'ini' + INTERNETSHORTCUT = 'internetshortcut' + IOSAPP = 'iosapp' + IPYNB = 'ipynb' + ISO = 'iso' + JAR = 'jar' + JAVA = 'java' + JAVABYTECODE = 'javabytecode' + JAVASCRIPT = 'javascript' + JINJA = 'jinja' + JNG = 'jng' + JNLP = 'jnlp' + JP2 = 'jp2' + JPEG = 'jpeg' + JSON = 'json' + JSONC = 'jsonc' + JSONL = 'jsonl' + JSX = 'jsx' + JULIA = 'julia' + JXL = 'jxl' + KO = 'ko' + KOTLIN = 'kotlin' + KS = 'ks' + LATEX = 'latex' + LATEXAUX = 'latexaux' + LESS = 'less' + LHA = 'lha' + LICENSE = 'license' + LISP = 'lisp' + LITCS = 'litcs' + LNK = 'lnk' + LOCK = 'lock' + LRZ = 'lrz' + LUA = 'lua' + LZ = 'lz' + LZ4 = 'lz4' + LZX = 'lzx' + M3U = 'm3u' + M4 = 'm4' + MACHO = 'macho' + MAFF = 'maff' + MAKEFILE = 'makefile' + MARKDOWN = 'markdown' + MATLAB = 'matlab' + MHT = 'mht' + MIDI = 'midi' + MKV = 'mkv' + MP2 = 'mp2' + MP3 = 'mp3' + MP4 = 'mp4' + MPEGTS = 'mpegts' + MSCOMPRESS = 'mscompress' + MSI = 'msi' + MSIX = 'msix' + MST = 'mst' + MUI = 'mui' + MUM = 'mum' + MUN = 'mun' + NIM = 'nim' + NPY = 'npy' + NPZ = 'npz' + NULL = 'null' + NUPKG = 'nupkg' + OBJECT = 'object' + OBJECTIVEC = 'objectivec' + OCAML = 'ocaml' + OCX = 'ocx' + ODEX = 'odex' + ODIN = 'odin' + ODP = 'odp' + ODS = 'ods' + ODT = 'odt' + OGG = 'ogg' + OLE = 'ole' + ONE = 'one' + ONNX = 'onnx' + OOXML = 'ooxml' + OTF = 'otf' + OUTLOOK = 'outlook' + PALMOS = 'palmos' + PARQUET = 'parquet' + PASCAL = 'pascal' + PBM = 'pbm' + PCAP = 'pcap' + PDB = 'pdb' + PDF = 
'pdf' + PEBIN = 'pebin' + PEM = 'pem' + PERL = 'perl' + PGP = 'pgp' + PHP = 'php' + PICKLE = 'pickle' + PNG = 'png' + PO = 'po' + POSTSCRIPT = 'postscript' + POWERSHELL = 'powershell' + PPT = 'ppt' + PPTX = 'pptx' + PRINTFOX = 'printfox' + PROLOG = 'prolog' + PROTEINDB = 'proteindb' + PROTO = 'proto' + PROTOBUF = 'protobuf' + PSD = 'psd' + PUB = 'pub' + PYTHON = 'python' + PYTHONBYTECODE = 'pythonbytecode' + PYTHONPAR = 'pythonpar' + PYTORCH = 'pytorch' + QOI = 'qoi' + QT = 'qt' + R = 'r' + RANDOMASCII = 'randomascii' + RANDOMBYTES = 'randombytes' + RAR = 'rar' + RDF = 'rdf' + RIFF = 'riff' + RLIB = 'rlib' + RLL = 'rll' + RPM = 'rpm' + RST = 'rst' + RTF = 'rtf' + RUBY = 'ruby' + RUST = 'rust' + RZIP = 'rzip' + SCALA = 'scala' + SCHEME = 'scheme' + SCR = 'scr' + SCRIPTWSF = 'scriptwsf' + SCSS = 'scss' + SEVENZIP = 'sevenzip' + SGML = 'sgml' + SH3D = 'sh3d' + SHELL = 'shell' + SMALI = 'smali' + SNAP = 'snap' + SO = 'so' + SOLIDITY = 'solidity' + SQL = 'sql' + SQLITE = 'sqlite' + SQUASHFS = 'squashfs' + SRT = 'srt' + STLBINARY = 'stlbinary' + STLTEXT = 'stltext' + SUM = 'sum' + SVD = 'svd' + SVG = 'svg' + SWF = 'swf' + SWIFT = 'swift' + SYMLINK = 'symlink' + SYMLINKTEXT = 'symlinktext' + SYS = 'sys' + TAR = 'tar' + TCL = 'tcl' + TEXTPROTO = 'textproto' + TGA = 'tga' + THUMBSDB = 'thumbsdb' + TIFF = 'tiff' + TMDX = 'tmdx' + TOML = 'toml' + TORRENT = 'torrent' + TROFF = 'troff' + TSV = 'tsv' + TSX = 'tsx' + TTF = 'ttf' + TWIG = 'twig' + TXT = 'txt' + TXTASCII = 'txtascii' + TXTUTF16 = 'txtutf16' + TXTUTF8 = 'txtutf8' + TYPESCRIPT = 'typescript' + UDF = 'udf' + UNDEFINED = 'undefined' + UNIXCOMPRESS = 'unixcompress' + UNKNOWN = 'unknown' + VBA = 'vba' + VBE = 'vbe' + VCARD = 'vcard' + VCS = 'vcs' + VCXPROJ = 'vcxproj' + VERILOG = 'verilog' + VHD = 'vhd' + VHDL = 'vhdl' + VISIO = 'visio' + VTT = 'vtt' + VUE = 'vue' + WAD = 'wad' + WASM = 'wasm' + WAV = 'wav' + WEBM = 'webm' + WEBP = 'webp' + WEBTEMPLATE = 'webtemplate' + WIM = 'wim' + WINREGISTRY = 'winregistry' + WMA = 
'wma' + WMF = 'wmf' + WMV = 'wmv' + WOFF = 'woff' + WOFF2 = 'woff2' + XAR = 'xar' + XCF = 'xcf' + XLS = 'xls' + XLSB = 'xlsb' + XLSX = 'xlsx' + XML = 'xml' + XPI = 'xpi' + XSD = 'xsd' + XZ = 'xz' + YAML = 'yaml' + YARA = 'yara' + ZIG = 'zig' + ZIP = 'zip' + ZLIBSTREAM = 'zlibstream' + ZST = 'zst' diff --git a/typings/magika/types/magika_error.pyi b/typings/magika/types/magika_error.pyi new file mode 100644 index 00000000..9ef1fc5f --- /dev/null +++ b/typings/magika/types/magika_error.pyi @@ -0,0 +1 @@ +class MagikaError(Exception): ... diff --git a/typings/magika/types/magika_prediction.pyi b/typings/magika/types/magika_prediction.pyi new file mode 100644 index 00000000..27d4d0e3 --- /dev/null +++ b/typings/magika/types/magika_prediction.pyi @@ -0,0 +1,9 @@ +from dataclasses import dataclass +from magika.types.content_type_info import ContentTypeInfo as ContentTypeInfo + +@dataclass(frozen=True) +class MagikaPrediction: + dl: ContentTypeInfo + output: ContentTypeInfo + score: float + def __init__(self, dl, output, score) -> None: ... diff --git a/typings/magika/types/magika_result.pyi b/typings/magika/types/magika_result.pyi new file mode 100644 index 00000000..2fcbe1ca --- /dev/null +++ b/typings/magika/types/magika_result.pyi @@ -0,0 +1,22 @@ +from magika.types.content_type_info import ContentTypeInfo as ContentTypeInfo +from magika.types.magika_prediction import MagikaPrediction as MagikaPrediction +from magika.types.status import Status as Status +from pathlib import Path + +class MagikaResult: + def __init__(self, *, path: Path, status: Status = ..., prediction: MagikaPrediction | None = None) -> None: ... + def __post_init__(self) -> None: ... + @property + def path(self) -> Path: ... + @property + def ok(self) -> bool: ... + @property + def status(self) -> Status: ... + @property + def prediction(self) -> MagikaPrediction: ... + @property + def dl(self) -> ContentTypeInfo: ... + @property + def output(self) -> ContentTypeInfo: ... 
+ @property + def score(self) -> float: ... diff --git a/typings/magika/types/model.pyi b/typings/magika/types/model.pyi new file mode 100644 index 00000000..d00ca2a4 --- /dev/null +++ b/typings/magika/types/model.pyi @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from magika.types.content_type_label import ContentTypeLabel as ContentTypeLabel + +@dataclass(frozen=True) +class ModelFeatures: + beg: list[int] + mid: list[int] + end: list[int] + offset_0x8000_0x8007: list[int] + offset_0x8800_0x8807: list[int] + offset_0x9000_0x9007: list[int] + offset_0x9800_0x9807: list[int] + def __init__(self, beg, mid, end, offset_0x8000_0x8007, offset_0x8800_0x8807, offset_0x9000_0x9007, offset_0x9800_0x9807) -> None: ... + +@dataclass(frozen=True) +class ModelOutput: + ct_label: ContentTypeLabel + score: float + def __init__(self, ct_label, score) -> None: ... + +@dataclass(frozen=True) +class ModelConfig: + beg_size: int + mid_size: int + end_size: int + use_inputs_at_offsets: bool + medium_confidence_threshold: float + min_file_size_for_dl: int + padding_token: int + block_size: int + target_labels_space: list[ContentTypeLabel] + thresholds: dict[ContentTypeLabel, float] + overwrite_map: dict[ContentTypeLabel, ContentTypeLabel] + def __init__(self, beg_size, mid_size, end_size, use_inputs_at_offsets, medium_confidence_threshold, min_file_size_for_dl, padding_token, block_size, target_labels_space, thresholds, overwrite_map) -> None: ... diff --git a/typings/magika/types/prediction_mode.pyi b/typings/magika/types/prediction_mode.pyi new file mode 100644 index 00000000..83879311 --- /dev/null +++ b/typings/magika/types/prediction_mode.pyi @@ -0,0 +1,8 @@ +from magika.types.strenum import LowerCaseStrEnum as LowerCaseStrEnum + +class PredictionMode(LowerCaseStrEnum): + BEST_GUESS = ... + MEDIUM_CONFIDENCE = ... + HIGH_CONFIDENCE = ... + @staticmethod + def get_valid_prediction_modes() -> list[str]: ... 
diff --git a/typings/magika/types/status.pyi b/typings/magika/types/status.pyi new file mode 100644 index 00000000..642b1d31 --- /dev/null +++ b/typings/magika/types/status.pyi @@ -0,0 +1,7 @@ +from magika.types.strenum import StrEnum as StrEnum + +class Status(StrEnum): + OK = 'ok' + FILE_NOT_FOUND_ERROR = 'file_not_found_error' + PERMISSION_ERROR = 'permission_error' + UNKNOWN = 'unknown' diff --git a/typings/magika/strenum.pyi b/typings/magika/types/strenum.pyi similarity index 96% rename from typings/magika/strenum.pyi rename to typings/magika/types/strenum.pyi index 05cab4e5..c74e90fe 100644 --- a/typings/magika/strenum.pyi +++ b/typings/magika/types/strenum.pyi @@ -1,6 +1,6 @@ -import enum - -class StrEnum(str, enum.Enum): - def __new__(cls, value: str | StrEnum, *args, **kwargs): ... - -class LowerCaseStrEnum(StrEnum): ... +import enum + +class StrEnum(str, enum.Enum): + def __new__(cls, value: str | StrEnum, *args, **kwargs): ... + +class LowerCaseStrEnum(StrEnum): ...