togethercomputer · hust-nj · Apr 27, 2023
diff --git a/data_prep/github/github_clean_dedup_local.py b/data_prep/github/github_clean_dedup_local.py
@@ -25,38 +25,46 @@ def get_timestamp() -> str:
 
 
 def clean_copyright_comments(content: str):
-    r = PAT.search(content)
-    if r:
-        # found one, now see if it contains "copyright", if so strip it
-        span = r.span()
-        sub = content[span[0]:span[1]]
-        if CPAT.search(sub):
-            # cut it
-            content = content[: span[0]] + content[span[1]:]
-
-        return content
-
+    """Remove a leading comment header and one early block comment.
+
+    Two passes are applied, in this order:
+
+    1. Leading lines that are empty or that start, after optional
+       indentation, with ``//``, ``#`` or ``--`` are dropped.
+    2. The first 100 remaining lines are searched with ``PAT``; the
+       first match is cut out if ``CPAT`` also matches inside it.
+       Lines after the first 100 are never searched.
+
+    :param content: text of one source file (newline-separated).
+    :return: the text without the header and, when both patterns
+        match, without the matched block.
+    """
+
     lines = content.split('\n')
     skip = 0
 
     # Greedy replace any file that begins with comment block, most
     # are copyright headers
     for k in range(len(lines)):
         if (
-                lines[k].startswith("//") or
-                lines[k].startswith("#") or
-                lines[k].startswith("--") or
+                lines[k].lstrip().startswith(("//", "#", "--")) or
                 not lines[k]
         ):
             skip = skip + 1
         else:
             break
 
-    if skip:
-        # we skipped, consume it
-        content = "\n".join(lines[skip:])
-
-    return content
+    lines = lines[skip:]
+
+    # Only the first 100 remaining lines are searched, so a block that
+    # starts deeper inside the file body is never removed.
+    head = '\n'.join(lines[:100])
+    r = PAT.search(head)
+    if r and CPAT.search(r.group(0)):
+        # CPAT (presumably "copyright") matches inside the block: cut it
+        head = head[:r.start()] + head[r.end():]
+
+    return '\n'.join([head] + lines[100:])
 
 
 def get_filecontent_stats(content: str) -> Dict[str, Union[int, str]]: