diff --git a/lib/zarex.ex b/lib/zarex.ex
index add4f50..5ddc769 100644
--- a/lib/zarex.ex
+++ b/lib/zarex.ex
@@ -7,7 +7,7 @@ defmodule Zarex do
   It deletes the bad stuff but leaves unicode characters in place, so users can use
   whatever alphabets they want to. Zarex also doesn't remove whitespace—instead, any
   sequence of whitespace that is 1 or more characters in length is collapsed to a
-  single space. Filenames are truncated so that they are at maximum 255 characters long.
+  single space. Filenames are truncated so that they are at maximum 255 bytes long.
 
   ### Examples
 
@@ -33,7 +33,7 @@ defmodule Zarex do
 
     String.trim(name)
     |> String.replace(~r/[[:space:]]+/u, " ")
-    |> String.slice(0, 255 - padding)
+    |> byte_aware_take(255 - padding)
     |> String.replace(~r/[\x00-\x1F\/\\:\*\?\"<>\|]/u, "")
     |> String.replace(~r/[[:space:]]+/u, " ")
     |> filter_windows_reserved_names(filename_fallback)
@@ -54,4 +54,29 @@ defmodule Zarex do
   defp filter_dots(name, fallback) do
     if String.starts_with?(name, "."), do: "#{fallback}#{name}", else: name
   end
+
+  # Returns the longest prefix of `string`, cut on a grapheme boundary, whose
+  # UTF-8 encoding is at most `limit` bytes.
+  defp byte_aware_take(string, limit) do
+    # A grapheme is never shorter than one byte, so the first `limit` graphemes
+    # are an upper bound on the result (and already the result for ASCII names).
+    candidate = String.slice(string, 0, limit)
+
+    if byte_size(candidate) <= limit do
+      candidate
+    else
+      candidate
+      |> String.graphemes()
+      |> Enum.reduce_while({0, []}, fn grapheme, {bytes, kept} = acc ->
+        total = bytes + byte_size(grapheme)
+
+        # Halt with the untouched accumulator so the fold yields the same
+        # `{bytes, graphemes}` shape whether or not it stops early.
+        if total <= limit, do: {:cont, {total, [grapheme | kept]}}, else: {:halt, acc}
+      end)
+      |> elem(1)
+      |> Enum.reverse()
+      |> Enum.join()
+    end
+  end
 end
diff --git a/test/zarex_test.exs b/test/zarex_test.exs
index 10cb9c4..a243ca9 100644
--- a/test/zarex_test.exs
+++ b/test/zarex_test.exs
@@ -18,6 +18,16 @@ defmodule ZarexTest do
     assert String.length(Zarex.sanitize(name, padding: 10)) == 245
   end
 
+  test "truncation enforces byte limit" do
+    name =
+      "😄👍😄👍😄👍😄👍01😄👍😄👍😄👍😄👍02😄👍😄👍😄👍😄👍03😄👍😄👍😄👍😄👍04😄👍😄👍😄👍😄👍05😄👍😄👍😄👍😄👍06😄👍😄👍😄👍😄👍07😄👍😄👍😄👍😄👍08😄👍😄👍😄👍😄👍09😄👍😄👍😄👍😄👍10😄👍😄👍😄👍😄👍11😄👍😄👍😄👍😄👍12😄👍😄👍😄👍😄👍13😄👍😄👍😄👍😄👍14😄👍😄👍😄👍😄👍15😄👍😄👍😄👍😄👍16😄👍😄👍😄👍😄👍17😄👍😄👍😄👍😄👍18😄👍😄👍😄👍😄👍19😄👍😄👍😄👍😄👍20😄👍😄👍😄👍😄👍21😄👍😄👍😄👍😄👍22😄👍😄👍😄👍😄👍23😄👍😄👍😄👍😄👍24😄👍😄👍😄👍😄👍25"
+
+    assert String.length(Zarex.sanitize(name)) == 74
+    assert byte_size(Zarex.sanitize(name)) == 254
+    assert String.length(Zarex.sanitize(name, padding: 10)) == 71
+    assert byte_size(Zarex.sanitize(name, padding: 10)) == 242
+  end
+
   test "sanitization" do
     assert "abcdef" == Zarex.sanitize("abcdef")
     assert "笊, ざる.pdf" == Zarex.sanitize("笊, ざる.pdf")