Add HateSpeech (HS) detection section and wrap inference examples in torch.no_grad()

2023-12-19 18:19:59 +03:00 · 2023-10-02 20:26:04 +03:00
parent ae9587a524
commit ec7127bed4
1 changed files with 61 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@
        - [Twitter Preprocessor](#preprocess)
        - [Feature Extraction](#feature_extraction)
        - [Sentiment Classification](#sa_lora)
+        - [HateSpeech Detection](#hs_lora)

 4. [Citation](#citation)
 # <a name="introduction"></a> TurkishBERTweet in the shadow of Large Language Models
@@ -27,7 +28,8 @@ Model | #params | Arch. | Max length | Pre-training data
 # <a name="loraAdapter"></a> Lora Adapters
 Model | train f1 | dev f1 | test f1 | Dataset Size
 ---|---|---|---|---
-`VRLLab/TurkishBERTweet-Lora-SA` | 0.799 | 0.687 | 0.692 | 42476 Turkish Tweets (uncased)
+`VRLLab/TurkishBERTweet-Lora-SA` | 0.799 | 0.687 | 0.692 | 42,476 Turkish Tweets  
+`VRLLab/TurkishBERTweet-Lora-hs` | 0.915 | 0.796 | 0.831 | 4,683 Turkish Tweets  
 # <a name="usage2"></a> Example usage


@@ -108,11 +110,11 @@ sample_texts = [


 preprocessed_texts = [preprocess(s) for s in sample_texts]
-
-for s in preprocessed_texts:
-    ids = tokenizer.encode_plus(s, return_tensors="pt")
-    label_id = turkishBERTweet_sa(**ids).logits.argmax(-1).item()
-    print(id2label[label_id],":", s)
+with torch.no_grad():
+    for s in preprocessed_texts:
+        ids = tokenizer.encode_plus(s, return_tensors="pt")
+        label_id = turkishBERTweet_sa(**ids).logits.argmax(-1).item()
+        print(id2label[label_id],":", s)
 ```

 ```output
@@ -121,6 +123,59 @@ negative : americanin diplatlari turkiyeye gelmesin <emoji> burundan_buharla_yü
 positive : mark zuckerberg ve elon musk'un boks müsabakası süper olacak! <emoji> kadın_muhafız_koyu_ten_tonu </emoji>
 neutral : adam dun ne yediğini unuttu
 ```
+## <a name="hs_lora"></a> HateSpeech Detection
+```python
+from peft import (
+    PeftModel,
+    PeftConfig,
+)
+
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer)
+from Preprocessor import preprocess
+import torch
+
+pretrained_model_path =  "VRLLab/TurkishBERTweet"
+peft_model = "VRLLab/TurkishBERTweet-Lora-hs"
+peft_config = PeftConfig.from_pretrained(peft_model)
+
+# loading Tokenizer
+padding_side = "right"
+tokenizer = AutoTokenizer.from_pretrained(
+    pretrained_model_path, padding_side=padding_side
+)
+if getattr(tokenizer, "pad_token_id") is None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+
+id2label = {0: "No", 1: "Yes"}
+turkishBERTweet_hs = AutoModelForSequenceClassification.from_pretrained(
+    peft_config.base_model_name_or_path, return_dict=True, num_labels=2, id2label=id2label
+)
+turkishBERTweet_hs = PeftModel.from_pretrained(turkishBERTweet_hs, peft_model)
+
+
+sample_texts = [
+    "Viral lab da insanlar hep birlikte çalışıyorlar. hepbirlikte çalışan insanlar birbirlerine yakın oluyorlar.",     
+    "kasmayin artik ya kac kere tanik olduk bu azgin tehlikeli \u201cmultecilerin\u201d yaptiklarina? bir afgan taragindan kafasi tasla ezilip tecavuz edilen kiza da git boyle cihangir solculugu yap yerse?",
+    ]
+
+
+preprocessed_texts = [preprocess(s) for s in sample_texts]
+id2label_hs = {0: "No", 1: "Yes"}
+with torch.no_grad():
+    for s in preprocessed_texts:
+        ids = tokenizer.encode_plus(s, return_tensors="pt")
+        label_id = turkishBERTweet_hs(**ids).logits.argmax(-1).item()
+        print(id2label_hs[label_id],":", s)
+```
+
+```output
+No : viral lab da insanlar hep birlikte çalışıyorlar. hepbirlikte çalışan insanlar birbirlerine yakın oluyorlar.
+Yes : kasmayin artik ya kac kere tanik olduk bu azgin tehlikeli “multecilerin” yaptiklarina? bir afgan taragindan kafasi tasla ezilip tecavuz edilen kiza da git boyle cihangir solculugu yap yerse?
+
+```
+ 

 # <a name="citation"></a> Citation
 ```bibtex