Fix small words after ":" and between "-" at the end of the input

blakeembrey · Dec 9, 2023 · 2e0c64c · 2e0c64c
1 parent 871ab96
commit 2e0c64c
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 23 deletions.
diff --git a/packages/title-case/src/index.spec.ts b/packages/title-case/src/index.spec.ts
@@ -66,6 +66,17 @@ const TEST_CASES: [string, string][] = [
   ],
   ["лев николаевич толстой", "Лев Николаевич Толстой"],
   ["Read foo-bar.com", "Read foo-bar.com"],
+  ["cowboy bebop: the movie", "Cowboy Bebop: The Movie"],
+  ["a thing. the thing. and more.", "A Thing. The Thing. And More."],
+  ['"a quote." a test.', '"A Quote." A Test.'],
+  ['"The U.N." a quote.', '"The U.N." A Quote.'],
+  ['"The U.N.". a quote.', '"The U.N.". A Quote.'],
+  ['"go without"', '"Go Without"'],
+  ["the iPhone: a quote", "The iPhone: A Quote"],
+  ["the U.N. and me", "The U.N. and Me"],
+  ["start-and-end", "Start-and-End"],
+  ["go-to-iPhone", "Go-to-iPhone"],
+  ["Keep #tag", "Keep #tag"],
 ];
 
 describe("swap case", () => {

diff --git a/packages/title-case/src/index.ts b/packages/title-case/src/index.ts
@@ -1,10 +1,22 @@
-const TOKENS = /\S+|./g;
-const IS_MANUAL_CASE = /\p{Ll}(?=[\p{Lu}])|\.\p{L}/u; // iPhone, example.com, U.N., etc.
+const TOKENS = /(\S+)|(.)/g;
+const IS_SPECIAL_CASE = /[\.#]\p{L}/u; // #tag, example.com, etc.
+const IS_MANUAL_CASE = /\p{Ll}(?=[\p{Lu}])/u; // iPhone, iOS, etc.
 const ALPHANUMERIC_PATTERN = /[\p{L}\d]+/gu;
+const IS_ACRONYM = /(?:\p{Lu}\.){2,}$/u;
 
-const WORD_SEPARATORS = new Set(["—", "–", "-", "―", "/"]);
+export const WORD_SEPARATORS = new Set(["—", "–", "-", "―", "/"]);
 
-const SMALL_WORDS = new Set([
+export const SENTENCE_TERMINATORS = new Set([
+  ".",
+  "!",
+  "?",
+  ":",
+  '"',
+  "'",
+  "”",
+]);
+
+export const SMALL_WORDS = new Set([
   "a",
   "an",
   "and",
@@ -45,6 +57,8 @@ const SMALL_WORDS = new Set([
 
 export interface Options {
   smallWords?: Set<string>;
+  sentenceTerminators?: Set<string>;
+  wordSeparators?: Set<string>;
   locale?: string | string[];
 }
 
@@ -54,38 +68,77 @@ export function titleCase(
 ) {
   let result = "";
   let m: RegExpExecArray | null;
+  let isNewSentence = true;
 
-  const { smallWords = SMALL_WORDS, locale } =
-    typeof options === "string" || Array.isArray(options)
-      ? { locale: options }
-      : options;
+  const {
+    smallWords = SMALL_WORDS,
+    sentenceTerminators = SENTENCE_TERMINATORS,
+    wordSeparators = WORD_SEPARATORS,
+    locale,
+  } = typeof options === "string" || Array.isArray(options)
+    ? { locale: options }
+    : options;
 
   // tslint:disable-next-line
   while ((m = TOKENS.exec(input)) !== null) {
-    const { 0: token, index } = m;
+    const { 1: token, 2: whiteSpace, index } = m;
+
+    if (whiteSpace) {
+      result += whiteSpace;
+      continue;
+    }
 
-    // Ignore already capitalized words.
-    if (IS_MANUAL_CASE.test(token)) {
+    // Ignore URLs, email addresses, acronyms, etc.
+    if (IS_SPECIAL_CASE.test(token)) {
       result += token;
+
+      // The period at the end of an acronym is not a new sentence.
+      if (IS_ACRONYM.test(token)) {
+        isNewSentence = false;
+        continue;
+      }
     } else {
-      result += token.replace(ALPHANUMERIC_PATTERN, (m, i) => {
-        // Ignore small words except at beginning or end.
-        if (
-          index > 0 &&
-          index + token.length < input.length &&
-          smallWords.has(m)
-        ) {
-          return m;
+      const matches = Array.from(token.matchAll(ALPHANUMERIC_PATTERN));
+      let value = token;
+
+      for (let i = 0; i < matches.length; i++) {
+        const { 0: word, index: wordIndex = 0 } = matches[i];
+
+        // Reset "new sentence" when we find a word.
+        if (isNewSentence) {
+          isNewSentence = false;
+        } else {
+          // Ignore small words except at the beginning or end,
+          // or when the previous token ended a sentence.
+          if (
+            smallWords.has(word) &&
+            // Not the final token and word.
+            !(index + token.length === input.length && i === matches.length - 1)
+          ) {
+            continue;
+          }
+        }
+
+        if (IS_MANUAL_CASE.test(word)) {
+          continue;
         }
 
         // Only capitalize words after a valid word separator.
-        if (i > 1 && !WORD_SEPARATORS.has(input.charAt(index + i - 1))) {
-          return m;
+        if (i > 0 && !wordSeparators.has(token.charAt(wordIndex - 1))) {
+          continue;
         }
 
-        return m.charAt(0).toLocaleUpperCase(locale) + m.slice(1);
-      });
+        value =
+          value.slice(0, wordIndex) +
+          value.charAt(wordIndex).toLocaleUpperCase(locale) +
+          value.slice(wordIndex + 1);
+      }
+
+      result += value;
     }
+
+    const lastChar = token.charAt(token.length - 1);
+    isNewSentence = sentenceTerminators.has(lastChar);
   }
 
   return result;