From 3dce538556daed9eaa52dc3cfad401ad793e358d Mon Sep 17 00:00:00 2001 From: Jack Cherng Date: Mon, 12 Aug 2024 15:49:51 +0800 Subject: [PATCH] refactor: use Magika v2 model Signed-off-by: Jack Cherng --- plugin/commands/auto_set_syntax.py | 27 +- typings/magika/__init__.pyi | 3 +- typings/magika/cli/magika.pyi | 19 - .../cli/magika_python_module_tester.pyi | 15 + typings/magika/content_types.pyi | 72 ---- typings/magika/magika.pyi | 19 +- typings/magika/prediction_mode.pyi | 10 - typings/magika/seekable.pyi | 19 + typings/magika/types.pyi | 49 --- typings/magika/types/__init__.pyi | 9 + typings/magika/types/content_type_info.pyi | 12 + typings/magika/types/content_type_label.pyi | 347 ++++++++++++++++++ typings/magika/types/magika_result.pyi | 9 + typings/magika/types/model.pyi | 34 ++ typings/magika/types/prediction_mode.pyi | 8 + typings/magika/types/status.pyi | 7 + typings/magika/types/statusor.pyi | 14 + typings/magika/{ => types}/strenum.pyi | 12 +- 18 files changed, 510 insertions(+), 175 deletions(-) delete mode 100644 typings/magika/cli/magika.pyi create mode 100644 typings/magika/cli/magika_python_module_tester.pyi delete mode 100644 typings/magika/content_types.pyi delete mode 100644 typings/magika/prediction_mode.pyi create mode 100644 typings/magika/seekable.pyi delete mode 100644 typings/magika/types.pyi create mode 100644 typings/magika/types/__init__.pyi create mode 100644 typings/magika/types/content_type_info.pyi create mode 100644 typings/magika/types/content_type_label.pyi create mode 100644 typings/magika/types/magika_result.pyi create mode 100644 typings/magika/types/model.pyi create mode 100644 typings/magika/types/prediction_mode.pyi create mode 100644 typings/magika/types/status.pyi create mode 100644 typings/magika/types/statusor.pyi rename typings/magika/{ => types}/strenum.pyi (96%) diff --git a/plugin/commands/auto_set_syntax.py b/plugin/commands/auto_set_syntax.py index 79c8ba8e..57606943 100644 --- a/plugin/commands/auto_set_syntax.py +++ b/plugin/commands/auto_set_syntax.py @@ -274,26 +274,35 @@ def _assign_syntax_with_magika(view_snapshot: ViewSnapshot, event: ListenerEvent magika = Magika(prediction_mode=PredictionMode.BEST_GUESS) # we have "magika.min_confidence" as the threshold if view_snapshot.path_obj and not view.is_dirty(): - result = magika.identify_path(view_snapshot.path_obj) + status_result = magika.identify_path(view_snapshot.path_obj) else: - result = magika.identify_bytes(view_snapshot.content_bytes) - # Logger.log(f"🐛 Magika's prediction: {result.output}", window=window) + status_result = magika.identify_bytes(view_snapshot.content_bytes) + # Logger.log(f"🐛 Magika's prediction: {status_result.value!r}", window=window) + + if not status_result.ok: + Logger.log(f"😢 Magika failed: {status_result.status}", window=window) + return False + + result = status_result.value + print(f"{result!r}") # @todo remove debug print + magika_label = result.output.label + magika_score = result.score # range: 0.0 ~ 1.0 threadshold: float = settings.get("magika.min_confidence", 0.0) - if result.output.score < threadshold or result.output.ct_label in {"directory", "empty", "txt", "unknown"}: + if magika_score < threadshold or magika_label in {"directory", "empty", "txt", "unknown"}: return False syntax_map: dict[str, list[str]] = extract_prefixed_dict(settings, prefix="magika.syntax_map.") - if not (syntax_likes := resolve_magika_label_with_syntax_map(result.output.ct_label, syntax_map)): - Logger.log(f"😢 Magika syntax map resolution failed for label: {result.output.ct_label}", window=window) + if not (syntax_likes := resolve_magika_label_with_syntax_map(magika_label, syntax_map)): + Logger.log(f"😢 Magika syntax map resolution failed for label: {magika_label}", window=window) return False if not (syntax := find_syntax_by_syntax_likes(syntax_likes, include_plaintext=False)): - Logger.log(f"😢 Failed finding syntax from Magika: {syntax_likes}", window=window) + Logger.log(f"😢 Failed mapping the label from Magika: {syntax_likes}", window=window) return False - confidence = round(result.output.score * 100, 2) - sublime.status_message(f"Predicted syntax: {result.output.ct_label} ({confidence}% confidence)") + confidence = round(magika_score * 100, 2) + sublime.status_message(f"Predicted label: {magika_label} ({confidence}% confidence)") return assign_syntax_to_view(view, syntax, details={"event": event, "reason": "Magika (Deep Learning)"}) diff --git a/typings/magika/__init__.pyi b/typings/magika/__init__.pyi index 3c4eef3d..5ff5c9e7 100644 --- a/typings/magika/__init__.pyi +++ b/typings/magika/__init__.pyi @@ -1,4 +1,5 @@ -from magika import magika as magika, prediction_mode as prediction_mode +from magika import magika as magika +from magika.types import prediction_mode as prediction_mode Magika = magika.Magika MagikaError = magika.MagikaError diff --git a/typings/magika/cli/magika.pyi b/typings/magika/cli/magika.pyi deleted file mode 100644 index 55e48598..00000000 --- a/typings/magika/cli/magika.pyi +++ /dev/null @@ -1,19 +0,0 @@ -from _typeshed import Incomplete -from magika import Magika as Magika, MagikaError as MagikaError, PredictionMode as PredictionMode, colors as colors -from magika.content_types import ContentTypesManager as ContentTypesManager -from magika.logger import get_logger as get_logger -from magika.types import FeedbackReport as FeedbackReport, MagikaResult as MagikaResult -from pathlib import Path -from typing import List, Optional - -VERSION: str -CONTACT_EMAIL: str -CONTEXT_SETTINGS: Incomplete -HELP_EPILOG: Incomplete - -def main(file: List[Path], recursive: bool, json_output: bool, jsonl_output: bool, mime_output: bool, label_output: bool, magic_compatibility_mode: bool, output_score: bool, prediction_mode_str: str, batch_size: int, no_dereference: bool, with_colors: bool, verbose: bool, debug: bool, generate_report_flag: bool, output_version: bool, list_output_content_types: bool, model_dir: Optional[Path]) -> None: ... -def should_read_from_stdin(files_paths: List[Path]) -> bool: ... -def get_magika_result_from_stdin(magika: Magika) -> MagikaResult: ... -def generate_feedback_report(magika: Magika, file_path: Path, magika_result: MagikaResult) -> FeedbackReport: ... -def print_feedback_report(magika: Magika, reports: List[FeedbackReport]) -> None: ... -def print_output_content_types_list() -> None: ... diff --git a/typings/magika/cli/magika_python_module_tester.pyi b/typings/magika/cli/magika_python_module_tester.pyi new file mode 100644 index 00000000..45c7c470 --- /dev/null +++ b/typings/magika/cli/magika_python_module_tester.pyi @@ -0,0 +1,15 @@ +from _typeshed import Incomplete +from magika import Magika as Magika, MagikaError as MagikaError, PredictionMode as PredictionMode, colors as colors +from magika.logger import get_logger as get_logger +from magika.types import ContentTypeLabel as ContentTypeLabel, MagikaResult as MagikaResult, Status as Status, StatusOr as StatusOr +from pathlib import Path + +VERSION: Incomplete +CONTACT_EMAIL: str +CONTEXT_SETTINGS: Incomplete +HELP_EPILOG: Incomplete + +def main(file: list[Path], recursive: bool, json_output: bool, jsonl_output: bool, mime_output: bool, label_output: bool, magic_compatibility_mode: bool, output_score: bool, prediction_mode_str: str, batch_size: int, no_dereference: bool, with_colors: bool, verbose: bool, debug: bool, dump_performance_stats_flag: bool, output_version: bool, model_dir: Path | None) -> None: ... +def should_read_from_stdin(files_paths: list[Path]) -> bool: ... +def get_magika_result_from_stdin(magika: Magika) -> StatusOr[MagikaResult]: ... +def path_and_result_to_dict(file_path: Path, result: StatusOr[MagikaResult]) -> dict: ... diff --git a/typings/magika/content_types.pyi b/typings/magika/content_types.pyi deleted file mode 100644 index 5f376e95..00000000 --- a/typings/magika/content_types.pyi +++ /dev/null @@ -1,72 +0,0 @@ -from _typeshed import Incomplete -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional - -CONTENT_TYPES_CONFIG_PATH: Incomplete - -class ContentType: - UNKNOWN: str - UNKNOWN_MIME_TYPE: str - UNKNOWN_CONTENT_TYPE_GROUP: str - UNKNOWN_MAGIC: str - UNKNOWN_DESCRIPTION: str - UNSUPPORTED: str - ERROR: str - MISSING: str - EMPTY: str - CORRUPTED: str - TIMEOUT: str - NOT_VALID: str - FILE_DOES_NOT_EXIST: str - PERMISSION_ERROR: str - DIRECTORY: str - SYMLINK: str - GENERIC_TEXT: str - name: Incomplete - extensions: Incomplete - mime_type: Incomplete - group: Incomplete - magic: Incomplete - description: Incomplete - vt_type: Incomplete - datasets: Incomplete - parent: Incomplete - tags: Incomplete - model_target_label: Incomplete - target_label: Incomplete - correct_labels: Incomplete - in_scope_for_output_content_type: Incomplete - def __init__(self, name: str, extensions: List[str], mime_type: Optional[str], group: Optional[str], magic: Optional[str], description: Optional[str], vt_type: Optional[str], datasets: List[str], parent: Optional[str], tags: List[str], model_target_label: Optional[str], target_label: Optional[str], correct_labels: List[str], in_scope_for_output_content_type: bool, add_automatic_tags: bool = True) -> None: ... - @property - def is_text(self) -> bool: ... - @property - def in_scope_for_training(self) -> bool: ... - def to_dict(self) -> Dict[str, Any]: ... - @staticmethod - def from_dict(info_d: Dict, add_automatic_tags: bool = True) -> ContentType: ... - -class ContentTypesManager: - SPECIAL_CONTENT_TYPES: List[str] - SUPPORTED_TARGET_LABELS_SPEC: Incomplete - cts: Incomplete - tag2cts: Incomplete - ext2cts: Incomplete - def __init__(self, content_type_config_path: Path = ..., add_automatic_tags: bool = True) -> None: ... - def load_content_types_info(self, content_type_config_path: Path, add_automatic_tags: bool = True) -> None: ... - def get(self, content_type_name: str) -> Optional[ContentType]: ... - def get_or_raise(self, content_type_name: Optional[str]) -> ContentType: ... - def get_mime_type(self, content_type_name: str, default: str = ...) -> str: ... - def get_group(self, content_type_name: str, default: str = ...) -> str: ... - def get_magic(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ... - def get_description(self, content_type_name: str, default: str = ..., fallback_to_label: bool = True) -> str: ... - def get_cts_by_ext(self, ext: str) -> List[ContentType]: ... - def get_cts_by_ext_or_raise(self, ext: str) -> List[ContentType]: ... - def get_valid_tags(self, only_explicit: bool = True) -> List[str]: ... - def is_valid_ct_label(self, label: str) -> bool: ... - def is_valid_tag(self, tag: str) -> bool: ... - def select(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[ContentType]: ... - def select_names(self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True) -> List[str]: ... - def get_content_types_space(self) -> List[str]: ... - def get_output_content_types(self) -> List[ContentType]: ... - def get_output_content_types_names(self) -> List[str]: ... - def get_invalid_labels(self, labels: Iterable[str]) -> List[str]: ... diff --git a/typings/magika/magika.pyi b/typings/magika/magika.pyi index c9873c78..5c6a0009 100644 --- a/typings/magika/magika.pyi +++ b/typings/magika/magika.pyi @@ -1,17 +1,18 @@ -from magika.content_types import ContentType as ContentType, ContentTypesManager as ContentTypesManager from magika.logger import get_logger as get_logger -from magika.prediction_mode import PredictionMode as PredictionMode -from magika.types import MagikaOutputFields as MagikaOutputFields, MagikaResult as MagikaResult, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, ModelOutputFields as ModelOutputFields +from magika.seekable import Buffer as Buffer, File as File, Seekable as Seekable +from magika.types import ContentTypeInfo as ContentTypeInfo, ContentTypeLabel as ContentTypeLabel, MagikaResult as MagikaResult, ModelConfig as ModelConfig, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput, PredictionMode as PredictionMode, Status as Status, StatusOr as StatusOr from pathlib import Path -from typing import List, Optional + +DEFAULT_MODEL_NAME: str class Magika: - def __init__(self, model_dir: Optional[Path] = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ... - def identify_path(self, path: Path) -> MagikaResult: ... - def identify_paths(self, paths: List[Path]) -> List[MagikaResult]: ... - def identify_bytes(self, content: bytes) -> MagikaResult: ... + def __init__(self, model_dir: Path | None = None, prediction_mode: PredictionMode = ..., no_dereference: bool = False, verbose: bool = False, debug: bool = False, use_colors: bool = False) -> None: ... + def identify_path(self, path: Path) -> StatusOr[MagikaResult]: ... + def identify_paths(self, paths: list[Path]) -> list[StatusOr[MagikaResult]]: ... + def identify_bytes(self, content: bytes) -> StatusOr[MagikaResult]: ... @staticmethod def get_default_model_name() -> str: ... - def get_model_name(self) -> str: ... + def get_model_dir_name(self) -> str: ... + def dump_performance_stats(self) -> None: ... class MagikaError(Exception): ... diff --git a/typings/magika/prediction_mode.pyi b/typings/magika/prediction_mode.pyi deleted file mode 100644 index f3ec6210..00000000 --- a/typings/magika/prediction_mode.pyi +++ /dev/null @@ -1,10 +0,0 @@ -from _typeshed import Incomplete -from magika.strenum import LowerCaseStrEnum as LowerCaseStrEnum -from typing import List - -class PredictionMode(LowerCaseStrEnum): - BEST_GUESS: Incomplete - MEDIUM_CONFIDENCE: Incomplete - HIGH_CONFIDENCE: Incomplete - @staticmethod - def get_valid_prediction_modes() -> List[str]: ... diff --git a/typings/magika/seekable.pyi b/typings/magika/seekable.pyi new file mode 100644 index 00000000..3ad71f40 --- /dev/null +++ b/typings/magika/seekable.pyi @@ -0,0 +1,19 @@ +import abc +from pathlib import Path + +class Seekable(abc.ABC, metaclass=abc.ABCMeta): + def __init__(self) -> None: ... + @property + def size(self) -> int: ... + @abc.abstractmethod + def read_at(self, offset: int, size: int) -> bytes: ... + def close(self) -> None: ... + +class File(Seekable): + def __init__(self, path: Path) -> None: ... + def read_at(self, offset: int, size: int) -> bytes: ... + def close(self) -> None: ... + +class Buffer(Seekable): + def __init__(self, buffer: bytes) -> None: ... + def read_at(self, offset: int, size: int) -> bytes: ... diff --git a/typings/magika/types.pyi b/typings/magika/types.pyi deleted file mode 100644 index c55fbe89..00000000 --- a/typings/magika/types.pyi +++ /dev/null @@ -1,49 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional - -@dataclass -class ModelFeatures: - beg: List[int] - mid: List[int] - end: List[int] - def __init__(self, beg, mid, end) -> None: ... - -@dataclass -class ModelOutput: - ct_label: str - score: float - def __init__(self, ct_label, score) -> None: ... - -@dataclass -class MagikaResult: - path: str - dl: ModelOutputFields - output: MagikaOutputFields - def __init__(self, path, dl, output) -> None: ... - -@dataclass -class ModelOutputFields: - ct_label: Optional[str] - score: Optional[float] - group: Optional[str] - mime_type: Optional[str] - magic: Optional[str] - description: Optional[str] - def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ... - -@dataclass -class MagikaOutputFields: - ct_label: str - score: float - group: str - mime_type: str - magic: str - description: str - def __init__(self, ct_label, score, group, mime_type, magic, description) -> None: ... - -@dataclass -class FeedbackReport: - hash: str - features: ModelFeatures - result: MagikaResult - def __init__(self, hash, features, result) -> None: ... diff --git a/typings/magika/types/__init__.pyi b/typings/magika/types/__init__.pyi new file mode 100644 index 00000000..0be68170 --- /dev/null +++ b/typings/magika/types/__init__.pyi @@ -0,0 +1,9 @@ +from magika.types.content_type_info import ContentTypeInfo as ContentTypeInfo +from magika.types.content_type_label import ContentTypeLabel as ContentTypeLabel +from magika.types.magika_result import MagikaResult as MagikaResult +from magika.types.model import ModelConfig as ModelConfig, ModelFeatures as ModelFeatures, ModelOutput as ModelOutput +from magika.types.prediction_mode import PredictionMode as PredictionMode +from magika.types.status import Status as Status +from magika.types.statusor import StatusOr as StatusOr + +__all__ = ['ContentTypeInfo', 'ContentTypeLabel', 'MagikaResult', 'ModelConfig', 'ModelFeatures', 'ModelOutput', 'PredictionMode', 'Status', 'StatusOr'] diff --git a/typings/magika/types/content_type_info.pyi b/typings/magika/types/content_type_info.pyi new file mode 100644 index 00000000..db078591 --- /dev/null +++ b/typings/magika/types/content_type_info.pyi @@ -0,0 +1,12 @@ +from dataclasses import dataclass +from magika.types.content_type_label import ContentTypeLabel as ContentTypeLabel + +@dataclass(frozen=True) +class ContentTypeInfo: + label: ContentTypeLabel + mime_type: str + group: str + description: str + extensions: list[str] + is_text: bool + def __init__(self, label, mime_type, group, description, extensions, is_text) -> None: ... diff --git a/typings/magika/types/content_type_label.pyi b/typings/magika/types/content_type_label.pyi new file mode 100644 index 00000000..aee4302b --- /dev/null +++ b/typings/magika/types/content_type_label.pyi @@ -0,0 +1,347 @@ +from magika.types.strenum import StrEnum as StrEnum + +class ContentTypeLabel(StrEnum): + ABNF = 'abnf' + ACE = 'ace' + ADA = 'ada' + AFF = 'aff' + AI = 'ai' + AIDL = 'aidl' + ALGOL68 = 'algol68' + ANI = 'ani' + APK = 'apk' + APPLEBPLIST = 'applebplist' + APPLEDOUBLE = 'appledouble' + APPLEPLIST = 'appleplist' + APPLESINGLE = 'applesingle' + AR = 'ar' + ARC = 'arc' + ARJ = 'arj' + ARROW = 'arrow' + ASC = 'asc' + ASD = 'asd' + ASF = 'asf' + ASM = 'asm' + ASP = 'asp' + AUTOHOTKEY = 'autohotkey' + AUTOIT = 'autoit' + AVI = 'avi' + AVIF = 'avif' + AVRO = 'avro' + AWK = 'awk' + AX = 'ax' + BATCH = 'batch' + BAZEL = 'bazel' + BCAD = 'bcad' + BIB = 'bib' + BMP = 'bmp' + BPG = 'bpg' + BPL = 'bpl' + BRAINFUCK = 'brainfuck' + BRF = 'brf' + BZIP = 'bzip' + BZIP3 = 'bzip3' + C = 'c' + CAB = 'cab' + CAD = 'cad' + CAT = 'cat' + CDF = 'cdf' + CHM = 'chm' + CLOJURE = 'clojure' + CMAKE = 'cmake' + COBOL = 'cobol' + COFF = 'coff' + COFFEESCRIPT = 'coffeescript' + COM = 'com' + CPL = 'cpl' + CPP = 'cpp' + CRT = 'crt' + CRX = 'crx' + CS = 'cs' + CSPROJ = 'csproj' + CSS = 'css' + CSV = 'csv' + CTL = 'ctl' + DART = 'dart' + DEB = 'deb' + DEX = 'dex' + DEY = 'dey' + DICOM = 'dicom' + DIFF = 'diff' + DIRECTORY = 'directory' + DJANGO = 'django' + DLL = 'dll' + DM = 'dm' + DMG = 'dmg' + DMIGD = 'dmigd' + DMSCRIPT = 'dmscript' + DOC = 'doc' + DOCKERFILE = 'dockerfile' + DOCX = 'docx' + DOSMBR = 'dosmbr' + DOTX = 'dotx' + DSSTORE = 'dsstore' + DWG = 'dwg' + DXF = 'dxf' + DYLIB = 'dylib' + EBML = 'ebml' + ELF = 'elf' + ELIXIR = 'elixir' + EMF = 'emf' + EML = 'eml' + EMPTY = 'empty' + EPUB = 'epub' + ERB = 'erb' + ERLANG = 'erlang' + ESE = 'ese' + EXE = 'exe' + EXP = 'exp' + FLAC = 'flac' + FLUTTER = 'flutter' + FLV = 'flv' + FORTRAN = 'fortran' + FPX = 'fpx' + GEMFILE = 'gemfile' + GEMSPEC = 'gemspec' + GIF = 'gif' + GITATTRIBUTES = 'gitattributes' + GITMODULES = 'gitmodules' + GLEAM = 'gleam' + GO = 'go' + GPX = 'gpx' + GRADLE = 'gradle' + GROOVY = 'groovy' + GZIP = 'gzip' + H = 'h' + H5 = 'h5' + HANDLEBARS = 'handlebars' + HASKELL = 'haskell' + HCL = 'hcl' + HEIF = 'heif' + HFS = 'hfs' + HLP = 'hlp' + HPP = 'hpp' + HTA = 'hta' + HTACCESS = 'htaccess' + HTML = 'html' + HVE = 'hve' + HWP = 'hwp' + ICC = 'icc' + ICNS = 'icns' + ICO = 'ico' + ICS = 'ics' + IGNOREFILE = 'ignorefile' + IMG = 'img' + INI = 'ini' + INTERNETSHORTCUT = 'internetshortcut' + IOSAPP = 'iosapp' + IPYNB = 'ipynb' + ISO = 'iso' + JAR = 'jar' + JAVA = 'java' + JAVABYTECODE = 'javabytecode' + JAVASCRIPT = 'javascript' + JINJA = 'jinja' + JNG = 'jng' + JNLP = 'jnlp' + JP2 = 'jp2' + JPEG = 'jpeg' + JSON = 'json' + JSONC = 'jsonc' + JSONL = 'jsonl' + JSX = 'jsx' + JULIA = 'julia' + JXL = 'jxl' + KO = 'ko' + KOTLIN = 'kotlin' + KS = 'ks' + LATEX = 'latex' + LATEXAUX = 'latexaux' + LESS = 'less' + LHA = 'lha' + LICENSE = 'license' + LISP = 'lisp' + LITCS = 'litcs' + LNK = 'lnk' + LOCK = 'lock' + LRZ = 'lrz' + LUA = 'lua' + LZ = 'lz' + LZ4 = 'lz4' + LZX = 'lzx' + M3U = 'm3u' + M4 = 'm4' + MACHO = 'macho' + MAFF = 'maff' + MAKEFILE = 'makefile' + MARKDOWN = 'markdown' + MATLAB = 'matlab' + MHT = 'mht' + MIDI = 'midi' + MKV = 'mkv' + MP2 = 'mp2' + MP3 = 'mp3' + MP4 = 'mp4' + MPEGTS = 'mpegts' + MSCOMPRESS = 'mscompress' + MSI = 'msi' + MSIX = 'msix' + MST = 'mst' + MUI = 'mui' + MUM = 'mum' + MUN = 'mun' + NIM = 'nim' + NPY = 'npy' + NPZ = 'npz' + NULL = 'null' + NUPKG = 'nupkg' + OBJECT = 'object' + OBJECTIVEC = 'objectivec' + OCAML = 'ocaml' + OCX = 'ocx' + ODEX = 'odex' + ODIN = 'odin' + ODP = 'odp' + ODS = 'ods' + ODT = 'odt' + OGG = 'ogg' + OLE = 'ole' + ONE = 'one' + ONNX = 'onnx' + OOXML = 'ooxml' + OTF = 'otf' + OUTLOOK = 'outlook' + PALMOS = 'palmos' + PARQUET = 'parquet' + PASCAL = 'pascal' + PBM = 'pbm' + PCAP = 'pcap' + PDB = 'pdb' + PDF = 'pdf' + PEBIN = 'pebin' + PEM = 'pem' + PERL = 'perl' + PGP = 'pgp' + PHP = 'php' + PICKLE = 'pickle' + PNG = 'png' + PO = 'po' + POSTSCRIPT = 'postscript' + POWERSHELL = 'powershell' + PPT = 'ppt' + PPTX = 'pptx' + PRINTFOX = 'printfox' + PROLOG = 'prolog' + PROTEINDB = 'proteindb' + PROTO = 'proto' + PROTOBUF = 'protobuf' + PSD = 'psd' + PUB = 'pub' + PYTHON = 'python' + PYTHONBYTECODE = 'pythonbytecode' + PYTHONPAR = 'pythonpar' + PYTORCH = 'pytorch' + QOI = 'qoi' + QT = 'qt' + R = 'r' + RANDOMASCII = 'randomascii' + RANDOMBYTES = 'randombytes' + RAR = 'rar' + RDF = 'rdf' + RIFF = 'riff' + RLIB = 'rlib' + RLL = 'rll' + RPM = 'rpm' + RST = 'rst' + RTF = 'rtf' + RUBY = 'ruby' + RUST = 'rust' + RZIP = 'rzip' + SCALA = 'scala' + SCHEME = 'scheme' + SCR = 'scr' + SCRIPTWSF = 'scriptwsf' + SCSS = 'scss' + SEVENZIP = 'sevenzip' + SGML = 'sgml' + SH3D = 'sh3d' + SHELL = 'shell' + SMALI = 'smali' + SNAP = 'snap' + SO = 'so' + SOLIDITY = 'solidity' + SQL = 'sql' + SQLITE = 'sqlite' + SQUASHFS = 'squashfs' + SRT = 'srt' + STLBINARY = 'stlbinary' + STLTEXT = 'stltext' + SUM = 'sum' + SVD = 'svd' + SVG = 'svg' + SWF = 'swf' + SWIFT = 'swift' + SYMLINK = 'symlink' + SYMLINKTEXT = 'symlinktext' + SYS = 'sys' + TAR = 'tar' + TCL = 'tcl' + TEXTPROTO = 'textproto' + TGA = 'tga' + THUMBSDB = 'thumbsdb' + TIFF = 'tiff' + TMDX = 'tmdx' + TOML = 'toml' + TORRENT = 'torrent' + TROFF = 'troff' + TSV = 'tsv' + TSX = 'tsx' + TTF = 'ttf' + TWIG = 'twig' + TXT = 'txt' + TXTASCII = 'txtascii' + TXTUTF16 = 'txtutf16' + TXTUTF8 = 'txtutf8' + TYPESCRIPT = 'typescript' + UDF = 'udf' + UNDEFINED = 'undefined' + UNIXCOMPRESS = 'unixcompress' + UNKNOWN = 'unknown' + VBA = 'vba' + VBE = 'vbe' + VCARD = 'vcard' + VCS = 'vcs' + VCXPROJ = 'vcxproj' + VERILOG = 'verilog' + VHD = 'vhd' + VHDL = 'vhdl' + VISIO = 'visio' + VTT = 'vtt' + VUE = 'vue' + WAD = 'wad' + WASM = 'wasm' + WAV = 'wav' + WEBM = 'webm' + WEBP = 'webp' + WIM = 'wim' + WINREGISTRY = 'winregistry' + WMA = 'wma' + WMF = 'wmf' + WMV = 'wmv' + WOFF = 'woff' + WOFF2 = 'woff2' + XAR = 'xar' + XCF = 'xcf' + XLS = 'xls' + XLSB = 'xlsb' + XLSX = 'xlsx' + XML = 'xml' + XPI = 'xpi' + XSD = 'xsd' + XZ = 'xz' + YAML = 'yaml' + YARA = 'yara' + ZIG = 'zig' + ZIP = 'zip' + ZLIBSTREAM = 'zlibstream' + ZST = 'zst' diff --git a/typings/magika/types/magika_result.pyi b/typings/magika/types/magika_result.pyi new file mode 100644 index 00000000..68229985 --- /dev/null +++ b/typings/magika/types/magika_result.pyi @@ -0,0 +1,9 @@ +from dataclasses import dataclass +from magika.types.content_type_info import ContentTypeInfo as ContentTypeInfo + +@dataclass(frozen=True) +class MagikaResult: + dl: ContentTypeInfo + output: ContentTypeInfo + score: float + def __init__(self, dl, output, score) -> None: ... diff --git a/typings/magika/types/model.pyi b/typings/magika/types/model.pyi new file mode 100644 index 00000000..d00ca2a4 --- /dev/null +++ b/typings/magika/types/model.pyi @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from magika.types.content_type_label import ContentTypeLabel as ContentTypeLabel + +@dataclass(frozen=True) +class ModelFeatures: + beg: list[int] + mid: list[int] + end: list[int] + offset_0x8000_0x8007: list[int] + offset_0x8800_0x8807: list[int] + offset_0x9000_0x9007: list[int] + offset_0x9800_0x9807: list[int] + def __init__(self, beg, mid, end, offset_0x8000_0x8007, offset_0x8800_0x8807, offset_0x9000_0x9007, offset_0x9800_0x9807) -> None: ... + +@dataclass(frozen=True) +class ModelOutput: + ct_label: ContentTypeLabel + score: float + def __init__(self, ct_label, score) -> None: ... + +@dataclass(frozen=True) +class ModelConfig: + beg_size: int + mid_size: int + end_size: int + use_inputs_at_offsets: bool + medium_confidence_threshold: float + min_file_size_for_dl: int + padding_token: int + block_size: int + target_labels_space: list[ContentTypeLabel] + thresholds: dict[ContentTypeLabel, float] + overwrite_map: dict[ContentTypeLabel, ContentTypeLabel] + def __init__(self, beg_size, mid_size, end_size, use_inputs_at_offsets, medium_confidence_threshold, min_file_size_for_dl, padding_token, block_size, target_labels_space, thresholds, overwrite_map) -> None: ... diff --git a/typings/magika/types/prediction_mode.pyi b/typings/magika/types/prediction_mode.pyi new file mode 100644 index 00000000..83879311 --- /dev/null +++ b/typings/magika/types/prediction_mode.pyi @@ -0,0 +1,8 @@ +from magika.types.strenum import LowerCaseStrEnum as LowerCaseStrEnum + +class PredictionMode(LowerCaseStrEnum): + BEST_GUESS = ... + MEDIUM_CONFIDENCE = ... + HIGH_CONFIDENCE = ... + @staticmethod + def get_valid_prediction_modes() -> list[str]: ... diff --git a/typings/magika/types/status.pyi b/typings/magika/types/status.pyi new file mode 100644 index 00000000..642b1d31 --- /dev/null +++ b/typings/magika/types/status.pyi @@ -0,0 +1,7 @@ +from magika.types.strenum import StrEnum as StrEnum + +class Status(StrEnum): + OK = 'ok' + FILE_NOT_FOUND_ERROR = 'file_not_found_error' + PERMISSION_ERROR = 'permission_error' + UNKNOWN = 'unknown' diff --git a/typings/magika/types/statusor.pyi b/typings/magika/types/statusor.pyi new file mode 100644 index 00000000..00819304 --- /dev/null +++ b/typings/magika/types/statusor.pyi @@ -0,0 +1,14 @@ +from magika.types.status import Status as Status +from typing import Generic, TypeVar + +T = TypeVar('T') + +class StatusOr(Generic[T]): + def __init__(self, *, status: Status = ..., value: T | None = None) -> None: ... + def __post_init__(self) -> None: ... + @property + def ok(self) -> bool: ... + @property + def status(self) -> Status: ... + @property + def value(self) -> T: ... diff --git a/typings/magika/strenum.pyi b/typings/magika/types/strenum.pyi similarity index 96% rename from typings/magika/strenum.pyi rename to typings/magika/types/strenum.pyi index 05cab4e5..c74e90fe 100644 --- a/typings/magika/strenum.pyi +++ b/typings/magika/types/strenum.pyi @@ -1,6 +1,6 @@ -import enum - -class StrEnum(str, enum.Enum): - def __new__(cls, value: str | StrEnum, *args, **kwargs): ... - -class LowerCaseStrEnum(StrEnum): ... +import enum + +class StrEnum(str, enum.Enum): + def __new__(cls, value: str | StrEnum, *args, **kwargs): ... + +class LowerCaseStrEnum(StrEnum): ...