Skip to content

Commit

Permalink
feat: experimental "Magika" syntax detect method
Browse files Browse the repository at this point in the history
  • Loading branch information
jfcherng committed Feb 18, 2024
1 parent 26a6fef commit eadf8b9
Show file tree
Hide file tree
Showing 23 changed files with 567 additions and 4 deletions.
59 changes: 59 additions & 0 deletions AutoSetSyntax.sublime-settings
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,65 @@
}
],

/////////////////////
// Magika settings //
/////////////////////

// To use this feature, you have to install the "magika" library.
// @see https://jfcherng-sublime.github.io/ST-AutoSetSyntax/dl-based-syntax-detection/#prerequisites
"magika.enabled": false,
"magika.min_confidence": 0.85,
// @see https://github.com/google/magika/blob/9e733e847ea0d93ea100d5d478a4b54c3ec5fd1c/docs/supported-content-types-list.md
"magika.syntax_map": {
"asm": [
// no good way to do this?
"scope:source.asm.x86_64",
"scope:source.asm.arm",
"scope:source.rvasm",
"scope:source.assembly"
],
"asp": ["scope:source.asp"],
"batch": ["scope:source.dosbatch"],
"c": [
// C++ is basically a superset and magika can't distinguish C and C++
"scope:source.c++"
],
"cs": ["scope:source.cs"],
"css": ["scope:source.scss", "scope:source.css"],
"csv": ["scope:text.advanced_csv", "scope:text.csv"],
"go": ["scope:source.go"],
"html": ["scope:text.html.basic"],
"ini": ["scope:source.ini"],
"java": ["scope:source.java"],
"javascript": ["scope:source.js"],
"json": ["scope:source.json"],
"latex": ["scope:text.tex.latex"],
"lisp": ["scope:source.lisp"],
"m3u": ["scope:text.m3u"],
"makefile": ["scope:source.makefile"],
"markdown": ["scope:text.html.markdown"],
"mum": ["=xml"],
"perl": ["scope:source.perl"],
"php": ["scope:embedding.php", "scope:text.html.php"],
"postscript": ["scope:source.postscript"],
"powershell": ["scope:source.powershell"],
"python": ["scope:source.python"],
"rdf": ["=xml"],
"rst": ["scope:text.restructuredtext"],
"rtf": ["scope:text.rtf"],
"ruby": ["scope:source.ruby"],
"rust": ["scope:source.rust"],
"scala": ["scope:source.scala"],
"shell": ["scope:source.shell.bash"],
"smali": ["scope:source.smali"],
"sql": ["scope:source.sql"],
"svg": ["=xml"],
"vba": ["scope:source.vbs"],
"winregistry": ["scope:source.reg"],
"xml": ["scope:text.xml"],
"yaml": ["scope:source.yaml"]
},

///////////////////////////////////////
// Guesslang settings (experimental) //
/////////////////////////////////////////////////////////////////////////
Expand Down
12 changes: 12 additions & 0 deletions docs/src/configurations.md
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,18 @@ To edit project settings, go to `Project` » `Edit Project`.

Available comparators are: `<`, `<=`, `==`, `>=`, `>` and `!=`.

#### `is_magika_enabled`

!!! example

```js
{
"constraint": "is_magika_enabled",
}
```

Test whether `magika` is enabled.

#### `is_name`

!!! example
Expand Down
32 changes: 32 additions & 0 deletions docs/src/experimental/dl-based-syntax-detection.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Deep Learning Based Syntax Detection

--8<-- "refs.md"

!!! warning "This feature is experimental and disabled by default."

## Overview

It uses Google's [Magika](https://github.com/google/magika) library to detect the file syntax.

## Prerequisites

1. If you are using macOS, the minimum supported OS version is macOS 11 (Big Sur).

1. Install dependencies.

Run `AutoSetSyntax: Download Dependencies` from the command palette.
The dependencies can be up to 50 MB in size, so it may take a while.
When it's done, there will be a popup dialogue.

1. Enable the feature.

Set `"magika.enabled"` to `true` in AutoSetSyntax's settings.

After you've completed all the steps above, it should just work without restarting Sublime Text.

## Demo

<video controls="controls" style="max-width:100%">
<source type="video/mp4" src="https://user-images.githubusercontent.com/6594915/133069990-ea6eaf22-f341-4c0c-9b74-1931f96c7183.mp4"></source>
<p>Your browser does not support the video element.</p>
</video>
2 changes: 1 addition & 1 deletion docs/src/experimental/ml-based-syntax-detection.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

--8<-- "refs.md"

!!! warning "This feature is experimental and disabled by default."
!!! warning "This feature has been deprecated and will be removed in AutoSetSyntax v3."

## Overview

Expand Down
4 changes: 4 additions & 0 deletions menus/Default.sublime-commands
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
"caption": "AutoSetSyntax: Toggle Log Panel",
"command": "auto_set_syntax_toggle_log_panel",
},
{
"caption": "AutoSetSyntax: Download Dependencies",
"command": "auto_set_syntax_download_dependencies",
},
{
"caption": "AutoSetSyntax: Download Guesslang Server",
"command": "auto_set_syntax_download_guesslang_server",
Expand Down
11 changes: 10 additions & 1 deletion plugin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib
import importlib.machinery
import pkgutil
import sys
from pathlib import Path

import sublime
Expand All @@ -11,11 +12,12 @@
AutoSetSyntaxCreateNewConstraintCommand,
AutoSetSyntaxCreateNewMatchCommand,
AutoSetSyntaxDebugInformationCommand,
AutoSetSyntaxDownloadDependenciesCommand,
AutoSetSyntaxDownloadGuesslangServerCommand,
AutoSetSyntaxRestartGuesslangCommand,
run_auto_set_syntax_on_view,
)
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME
from .constants import PLUGIN_CUSTOM_MODULE_PATHS, PLUGIN_NAME, PLUGIN_PY_LIBS_DIR
from .listener import (
AutoSetSyntaxEventListener,
AutoSetSyntaxTextChangeListener,
Expand Down Expand Up @@ -48,6 +50,7 @@
"AutoSetSyntaxCreateNewConstraintCommand",
"AutoSetSyntaxCreateNewMatchCommand",
"AutoSetSyntaxDebugInformationCommand",
"AutoSetSyntaxDownloadDependenciesCommand",
"AutoSetSyntaxDownloadGuesslangServerCommand",
"AutoSetSyntaxRestartGuesslangCommand",
# ST: listeners
Expand All @@ -67,6 +70,7 @@ def plugin_loaded() -> None:


def _plugin_loaded() -> None:
_add_python_lib_path()
_load_custom_implementations()

AioSettings.plugin_name = PLUGIN_NAME
Expand Down Expand Up @@ -98,6 +102,11 @@ def _settings_changed_callback(window: sublime.Window) -> None:
compile_rules(window, is_update=True)


def _add_python_lib_path() -> None:
    """Ensure the directory given by ``PLUGIN_PY_LIBS_DIR`` is present in ``sys.path``.

    When the directory is missing from ``sys.path``, it is inserted at the very
    front so that it is searched first; otherwise ``sys.path`` is left untouched.
    """
    libs_dir = str(PLUGIN_PY_LIBS_DIR)
    if libs_dir in sys.path:
        return
    sys.path.insert(0, libs_dir)


def _load_custom_implementations() -> None:
for finder, name, _ in pkgutil.iter_modules(map(str, PLUGIN_CUSTOM_MODULE_PATHS.values())):
assert isinstance(finder, importlib.machinery.FileFinder)
Expand Down
2 changes: 2 additions & 0 deletions plugin/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
AutoSetSyntaxCreateNewMatchCommand,
)
from .auto_set_syntax_debug_information import AutoSetSyntaxDebugInformationCommand
from .auto_set_syntax_download_dependencies import AutoSetSyntaxDownloadDependenciesCommand
from .auto_set_syntax_download_guesslang_server import AutoSetSyntaxDownloadGuesslangServerCommand
from .auto_set_syntax_restart_guesslang import AutoSetSyntaxRestartGuesslangCommand

Expand All @@ -13,6 +14,7 @@
"AutoSetSyntaxCreateNewConstraintCommand",
"AutoSetSyntaxCreateNewMatchCommand",
"AutoSetSyntaxDebugInformationCommand",
"AutoSetSyntaxDownloadDependenciesCommand",
"AutoSetSyntaxDownloadGuesslangServerCommand",
"AutoSetSyntaxRestartGuesslangCommand",
# ...
Expand Down
57 changes: 56 additions & 1 deletion plugin/commands/auto_set_syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from ..constants import PLUGIN_NAME, RE_ST_SYNTAX_TEST_LINE, RE_VIM_SYNTAX_LINE
from ..guesslang.types import GuesslangServerPredictionItem, GuesslangServerResponse
from ..helpers import is_syntaxable_view
from ..helpers import is_syntaxable_view, resolve_magika_label_with_syntax_map
from ..libs import websocket
from ..logger import Logger
from ..rules import SyntaxRuleCollection
Expand Down Expand Up @@ -194,6 +194,17 @@ def run_auto_set_syntax_on_view(
} and _assign_syntax_with_trimmed_filename(view, event):
return True

if event in {
ListenerEvent.COMMAND,
ListenerEvent.INIT,
ListenerEvent.LOAD,
ListenerEvent.MODIFY,
ListenerEvent.PASTE,
ListenerEvent.SAVE,
ListenerEvent.UNTRANSIENTIZE,
} and _assign_syntax_with_magika(view, event):
return True

if _assign_syntax_with_heuristics(view, event):
return True

Expand Down Expand Up @@ -374,6 +385,50 @@ def is_json(view: sublime.View) -> bool:
return False


def _assign_syntax_with_magika(view: sublime.View, event: ListenerEvent | None = None) -> bool:
    """
    Try to assign a syntax to ``view`` using Magika's content-type prediction.

    :param view: The view whose snapshot content is classified.
    :param event: The listener event which triggered this run, if any.

    :returns: ``False`` if a precondition is not met, the ``magika`` library can't be
              imported, the prediction is rejected, or no syntax can be resolved from
              the predicted label. Otherwise, the result of ``assign_syntax_to_view``.
    """
    if not (
        (window := view.window())
        and (settings := get_merged_plugin_settings(window=window))
        and settings.get("magika.enabled")
        and (view_snapshot := G.view_snapshot_collection.get_by_view(view))
    ):
        return False

    # don't apply on files which have an extension, unless triggered by the command
    if event != ListenerEvent.COMMAND and "." in view_snapshot.file_name_unhidden:
        return False

    # only apply on plain text syntax
    if not ((current_syntax := view_snapshot.syntax) and is_plaintext_syntax(current_syntax)):
        return False

    # we don't want to use AI model during typing when there is only one line
    # that may result in unwanted behavior such as a new buffer may be assigned to Python
    # right after "import" is typed but it could be JavaScript or TypeScript as well
    if event == ListenerEvent.MODIFY and "\n" not in view_snapshot.content:
        return False

    # imported lazily because "magika" is an optional dependency which may not be installed
    try:
        from magika import Magika
    except ImportError as e:
        Logger.log(f"💣 Error occurred when importing Magika: {e}", window=window)
        return False

    classifier = Magika()
    output = classifier.identify_bytes(view_snapshot.content.encode()).output
    Logger.log(f"🐛 Magika's prediction: {output}", window=window)

    # reject low-confidence predictions and labels which don't correspond to any syntax
    threshold: float = settings.get("magika.min_confidence", 0.0)
    if output.score < threshold or output.ct_label in {"directory", "empty", "txt", "unknown"}:
        return False

    syntax_map: dict[str, list[str]] = settings.get("magika.syntax_map", {})
    if not (syntax_likes := resolve_magika_label_with_syntax_map(output.ct_label, syntax_map)):
        Logger.log(f'🤔 Unknown "label" from Magika: {output.ct_label}', window=window)
        return False

    if not (syntax := find_syntax_by_syntax_likes(syntax_likes, include_plaintext=False)):
        Logger.log(f"😢 Failed finding syntax from Magika: {syntax_likes}", window=window)
        return False

    sublime.status_message(f"Predicted syntax: {output.ct_label} ({round(output.score * 100, 2)}% confidence)")
    return assign_syntax_to_view(view, syntax, details={"event": event, "reason": "Magika (Deep Learning)"})


def _assign_syntax_with_guesslang_async(view: sublime.View, event: ListenerEvent | None = None) -> None:
if not (
G.guesslang_client
Expand Down
Loading

0 comments on commit eadf8b9

Please sign in to comment.