Added the GrammarAutoCorrector

recodehive · Nov 3, 2024 · 4db4269 · 4db4269
1 parent 1f6e3f6
commit 4db4269
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 0 deletions.
diff --git a/NLP/Grammar Auto Corrector/README.md b/NLP/Grammar Auto Corrector/README.md
@@ -0,0 +1,42 @@
+# Grammar AutoCorrector
+
+Grammar AutoCorrector is a tool that automatically detects and corrects grammatical errors in English sentences. It combines NLP preprocessing techniques with the T5 transformer model for grammar correction, making it suitable for writing assistance tools, educational platforms, and similar applications.
+
+## Features
+- Preprocessing techniques such as tokenization, lemmatization, stop word removal, and punctuation removal.
+- Training of a grammar correction model using large datasets with grammatically correct sentences.
+- Ability to identify and correct common grammatical errors in sentences.
+
+
+
+## Dependencies
+
+### Modules Used
+
+1. Transformers
+2. Torch
+3. NLTK
+4. SpaCy
+5. Pandas
+6. NumPy
+7. re (Regular Expressions)
+8. Scikit-Learn
+9. pytest
+10. datasets (Hugging Face)
+11. yaml
+12. tqdm
+
+
+## Data
+Download appropriate grammar correction datasets, such as:
+- [Cambridge English Write & Improve + LOCNESS](https://ilexir.co.uk/datasets/index.html)
+- [Grammarly GEC Dataset](https://www.grammarly.com/research/grammatical-error-correction/)
+- [JFLEG](https://github.com/keisks/jfleg)
+
+
+
+## Connect with Me
+
+- **GitHub**: [Peart-Guy](https://github.com/Peart-Guy)
+- **LinkedIn**: [Ankan Mukhopadhyay](https://www.linkedin.com/in/ankan-mukhopadhyaypeartguy/)
+
diff --git a/NLP/Grammar Auto Corrector/main.py b/NLP/Grammar Auto Corrector/main.py
@@ -0,0 +1,74 @@
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import re
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+from transformers import Trainer, TrainingArguments
+from datasets import load_dataset
+
+# Download the NLTK resources this script relies on (runs every time the module is imported)
+nltk.download('punkt')  # tokenizer data for word_tokenize
+nltk.download('stopwords')  # corpus read by stopwords.words('english') below
+nltk.download('wordnet')  # lexical database backing WordNetLemmatizer
+nltk.download('averaged_perceptron_tagger')  # POS tagger model; NOTE(review): no tagging call is visible in this file - confirm it is needed
+
+# Module-level lemmatizer and English stop-word set shared by preprocess_text
+lemmatizer = WordNetLemmatizer()
+stop_words = set(stopwords.words('english'))
+
+def preprocess_text(text):
+    # Lowercase the text
+    text = text.lower()
+    # Remove punctuation
+    text = re.sub(r'[^\w\s]', '', text)
+    # Tokenize text
+    tokens = word_tokenize(text)
+    # Lemmatize and remove stop words
+    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
+    return tokens
+
+
+# Load the pre-trained 't5-small' checkpoint and its tokenizer as module-level globals used by train_model and correct_grammar
+model = T5ForConditionalGeneration.from_pretrained('t5-small')
+tokenizer = T5Tokenizer.from_pretrained('t5-small')
+
+dataset = load_dataset("bookcorpus", split="train")  # BookCorpus train split; NOTE(review): train_model reads "input_texts"/"output_texts" columns - confirm this dataset provides them
+wiki_dataset = load_dataset("wikipedia", "20220301.en", split="train")  # Wikipedia config "20220301.en", train split; NOTE(review): not referenced anywhere else in the visible code
+
+# Define a training function
+def train_model(dataset):
+    # Tokenize inputs and outputs
+    inputs = tokenizer(["correct: " + text for text in dataset["input_texts"]], return_tensors="pt", padding=True)
+    outputs = tokenizer(["grammar_corrected: " + text for text in dataset["output_texts"]], return_tensors="pt", padding=True)
+
+    # Define Trainer
+    training_args = TrainingArguments(
+        output_dir='./results',          
+        per_device_train_batch_size=4,   
+        num_train_epochs=3,              
+        weight_decay=0.01,               
+    )
+    trainer = Trainer(
+        model=model,                     
+        args=training_args,              
+        train_dataset=dataset            
+    )
+
+    trainer.train()
+
+# Fine-tune the model at import time on the dataset loaded above; NOTE(review): preprocess_text is never applied to it, so the data is not actually preprocessed
+train_model(dataset)
+
+
+
+def correct_grammar(text):
+    input_text = "correct: " + text
+    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+    outputs = model.generate(input_ids)
+    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return corrected_text
+
+# Example usage: run the model on one deliberately ungrammatical sentence and print the result
+test_sentence = "She go to the market every morning."
+print("Corrected Sentence:", correct_grammar(test_sentence))