Spaces:

asdfasdfdsafdsa
/

czech-gec-punctuation-pipeline

Runtime error

App Files Files Community

asdfasdfdsafdsa committed on Sep 19

Commit

89b7ad2

verified ·

1 Parent(s): 39958b2

Upload 3 files

Browse files

Files changed (3) hide show

README.md +4 -3
app.py +76 -16
requirements.txt +1 -4

README.md CHANGED Viewed

@@ -50,9 +50,10 @@ A comprehensive pipeline that combines grammatical error correction with punctua
   - ByT5-large model fine-tuned on Czech GEC corpus
   - Handles complex grammatical errors
-- **Punctuation**: [1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase](https://huggingface.co/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase)
-  - XLM-RoBERTa for punctuation restoration
-  - Supports capitalization and sentence boundaries
 ## 💡 Use Cases

   - ByT5-large model fine-tuned on Czech GEC corpus
   - Handles complex grammatical errors
+- **Punctuation**: [kredor/punctuate-all](https://huggingface.co/kredor/punctuate-all)
+  - Token classification model for punctuation restoration
+  - Supports Czech and 11 other languages
+  - Adds punctuation marks: . , ? - :
 ## 💡 Use Cases

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from punctuators.models import PunctCapSegModelONNX
 from difflib import SequenceMatcher
 import re
@@ -17,9 +16,9 @@ print(f"GEC model loaded on {device}")
 # Load punctuation model
 print("Loading punctuation model...")
-punct_model = PunctCapSegModelONNX.from_pretrained(
-    "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
-)
 print("Punctuation model loaded!")
 def gec_correct(input_text):
@@ -117,23 +116,84 @@ def gec_correct(input_text):
     return corrections
 def punct_correct(input_text):
-    """Generate 3 different punctuation corrections"""
     if not input_text.strip():
         return ["", "", ""]
     corrections = []
-    # Conservative - no sentence boundaries
-    result = punct_model.infer(texts=[input_text], apply_sbd=False)
-    corrections.append(result[0])
-    # With sentence boundaries
-    result = punct_model.infer(texts=[input_text], apply_sbd=True)
-    corrections.append("\n".join(result[0]) if isinstance(result[0], list) else result[0])
-    # Balanced
-    result = punct_model.infer(texts=[input_text], apply_sbd=False)
-    corrections.append(result[0])
     return corrections
@@ -382,7 +442,7 @@ with gr.Blocks(title="Czech GEC + Punctuation Pipeline", theme=gr.themes.Soft())
     ---
     **Models:**
     - GEC: [ufal/byt5-large-geccc-mate](https://huggingface.co/ufal/byt5-large-geccc-mate)
-    - Punctuation: [1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase](https://huggingface.co/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase)
     """)
 # Launch the app

 import gradio as gr
 import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification, pipeline
 from difflib import SequenceMatcher
 import re
 # Load punctuation model
 print("Loading punctuation model...")
+punct_tokenizer = AutoTokenizer.from_pretrained("kredor/punctuate-all")
+punct_model = AutoModelForTokenClassification.from_pretrained("kredor/punctuate-all")
+punct_pipeline = pipeline("token-classification", model=punct_model, tokenizer=punct_tokenizer, device=0 if torch.cuda.is_available() else -1)
 print("Punctuation model loaded!")
 def gec_correct(input_text):
     return corrections
 def punct_correct(input_text):
     """Generate 3 punctuation variants of the input using kredor/punctuate-all.

     The text is lowercased, tagged by the module-level ``punct_pipeline``
     (a token-classification pipeline), and each whitespace-separated word
     receives the punctuation mark predicted for its last sub-token.

     Args:
         input_text: Text to punctuate.

     Returns:
         A list of three strings:
           1. the lowercased text with predicted punctuation appended to words,
           2. variant 1 with the first character of every sentence upper-cased,
           3. variant 2 with any space directly before a punctuation mark
              removed.
         Blank or whitespace-only input yields ``["", "", ""]``.
     """
     if not input_text.strip():
         return ["", "", ""]

     # The original author notes the model expects lowercase input.
     clean_text = input_text.lower()
     results = punct_pipeline(clean_text)

     # Words and their end offsets come from the same scan so that the two
     # lists are always the same length and in the same order.
     word_matches = list(re.finditer(r'\S+', clean_text))
     words = [m.group() for m in word_matches]
     marks = _punct_per_word([m.end() for m in word_matches], results)

     # Marks are applied by position, so a word that occurs several times
     # gets the mark predicted for each individual occurrence. A word that
     # already ends in punctuation is left alone to avoid output like "ok..".
     punctuated_words = [
         word if not mark or word.endswith(_TRAILING_PUNCT) else word + mark
         for word, mark in zip(words, marks)
     ]
     base_result = ' '.join(punctuated_words)

     # Variant 2: capitalize the first character of each sentence.
     sentences = re.split(r'(?<=[.?!])\s+', base_result)
     capitalized = ' '.join(s[0].upper() + s[1:] if s else s for s in sentences)

     # Variant 3: drop stray spaces that precede a punctuation mark.
     clean = capitalized
     for p in [',', '.', '?', ':', '!', ';']:
         clean = clean.replace(f' {p}', p)

     return [base_result, capitalized, clean]


 # Maps a pipeline ``entity`` label to the mark to append. The ``LABEL_n``
 # names are the ones the original code handled.
 # NOTE(review): the checkpoint may instead name its labels by the mark
 # itself ("0", ".", ",", ...) - confirm against the model's id2label. Both
 # spellings are accepted; anything unknown (including "0"/"LABEL_0") means
 # "no punctuation".
 _PUNCT_BY_LABEL = {
     'LABEL_1': '.',
     'LABEL_2': ',',
     'LABEL_3': '?',
     'LABEL_4': '-',
     'LABEL_5': ':',
     '.': '.',
     ',': ',',
     '?': '?',
     '-': '-',
     ':': ':',
 }

 # A word already ending in one of these is not given a second mark.
 _TRAILING_PUNCT = ('.', ',', '?', '!', ':', ';', '-')


 def _punct_per_word(word_ends, token_results):
     """Return one punctuation mark ('' for none) per word, in text order.

     Args:
         word_ends: End character offset of every whitespace-separated word.
         token_results: Per-token dicts from the pipeline; ``word`` and
             ``entity`` are read, and ``end`` when it is present.

     Returns:
         A list with ``len(word_ends)`` entries. The label of the last
         sub-token belonging to a word decides that word's mark.
     """
     marks = [''] * len(word_ends)

     # Tokens that carry no text (e.g. a lone '▁') never decide a mark.
     pieces = [r for r in token_results if r['word'].replace('▁', '').strip()]

     if all(r.get('end') is not None for r in pieces):
         # Preferred path: character offsets tie each token to its word.
         idx = 0
         for r in pieces:
             while idx < len(word_ends) and word_ends[idx] < r['end']:
                 idx += 1
             if idx == len(word_ends):
                 break
             # Later sub-tokens of the same word overwrite earlier ones.
             marks[idx] = _PUNCT_BY_LABEL.get(r['entity'], '')
         return marks

     # Fallback when offsets are unavailable: a token starting with '▁'
     # opens the next word, every other token continues the current one.
     idx = -1
     open_word_has_text = True
     for r in token_results:
         piece = r['word']
         if (piece.startswith('▁') or idx < 0) and open_word_has_text:
             idx += 1
             open_word_has_text = False
         if idx >= len(marks):
             break
         if piece.replace('▁', '').strip():
             marks[idx] = _PUNCT_BY_LABEL.get(r['entity'], '')
             open_word_has_text = True
     return marks
     ---
     **Models:**
     - GEC: [ufal/byt5-large-geccc-mate](https://huggingface.co/ufal/byt5-large-geccc-mate)
+    - Punctuation: [kredor/punctuate-all](https://huggingface.co/kredor/punctuate-all)
     """)
 # Launch the app

requirements.txt CHANGED Viewed

@@ -1,6 +1,3 @@
 gradio>=4.0.0
 torch>=2.0.0
-transformers>=4.30.0
-punctuators==0.0.7
-onnx>=1.14.0
-onnxruntime>=1.15.0

 gradio>=4.0.0
 torch>=2.0.0
+transformers>=4.30.0