mirror of
https://github.com/ViralLab/TurkishBERTweet.git
synced 2023-12-19 18:19:59 +03:00
Preprocessing bug fix (URL/domain handling in the tweet normalisation pipeline)
This commit is contained in:
@@ -1,73 +1,74 @@
|
||||
import urllib
|
||||
import html
|
||||
import re
|
||||
|
||||
from urlextract import URLExtract
|
||||
from unicodedata import normalize
|
||||
|
||||
from .demojize import demojize
|
||||
|
||||
|
||||
def hashtag_handler(text: str):
    """Wrap every '#tag' occurrence as ' <hashtag> tag </hashtag> '."""
    tag_pattern = re.compile(r"(#([^\s]+))")
    return tag_pattern.sub(r" <hashtag> \2 </hashtag> ", text)
|
||||
|
||||
|
||||
def cashtag_handler(text: str):
    """Wrap every '$TICKER' occurrence as ' <cashtag> TICKER </cashtag> '."""
    ticker_pattern = re.compile(r"(\$([^\s]+))")
    return ticker_pattern.sub(r" <cashtag> \2 </cashtag> ", text)
|
||||
|
||||
|
||||
def mention_handler(text: str):
    """Replace every '@username' mention with the anonymous ' @user ' token."""
    mention_pattern = re.compile(r"(@([^\s]+))")
    return mention_pattern.sub(" @user ", text)
|
||||
|
||||
|
||||
# Single module-level URLExtract instance, shared by url_handler.
url_extractor = URLExtract()
|
||||
|
||||
|
||||
def url_handler(text: str):
    """Replace every URL found in *text* with ' <http> netloc </http> '."""
    found = list(url_extractor.gen_urls(text))
    for raw_url in found:
        # urlparse only fills .netloc when a scheme is present, so
        # prepend one for bare domains before parsing.
        parseable = raw_url if "http" in raw_url else f"https://{raw_url}"
        host = urllib.parse.urlparse(parseable).netloc
        text = text.replace(raw_url, f" <http> {host} </http> ")
    return text
|
||||
|
||||
|
||||
def email_handler(text: str):
    """Mask every e-mail address in *text* with the ' <email> ' token."""
    addresses = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)
    for address in addresses:
        # .strip() after each substitution trims whitespace introduced
        # when the address sits at either end of the string.
        text = text.replace(address, " <email> ").strip()
    return text
|
||||
|
||||
|
||||
def emoji_handler(text: str):
    """Convert emoji to Turkish descriptions wrapped in <emoji> ... </emoji> tags."""
    open_tag, close_tag = " <emoji> ", " </emoji> "
    return demojize(text, language="tr", delimiters=(open_tag, close_tag))
|
||||
|
||||
|
||||
def normalize_text(text: str):
    """Return *text* in Unicode NFC (canonical composed) form."""
    canonical = normalize("NFC", text)
    return canonical
|
||||
|
||||
|
||||
def preprocess(text: str):
    """Run the full tweet-normalisation pipeline over *text*.

    Steps (order matters): HTML-unescape, Unicode NFC, then mask
    e-mails, URLs, hashtags, cashtags, mentions and emoji; finally
    collapse whitespace, lowercase, and trim.
    """
    pipeline = (
        html.unescape,
        normalize_text,
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    output = text
    for step in pipeline:
        output = step(output)
    output = re.sub(r"\s+", " ", output)
    return output.lower().strip()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick smoke test of the pipeline (empty input -> empty output).
    demo_text = ""
    print(preprocess(demo_text))
|
||||
import urllib
|
||||
import html
|
||||
import re
|
||||
|
||||
from urlextract import URLExtract
|
||||
from unicodedata import normalize
|
||||
|
||||
from .demojize import demojize
|
||||
|
||||
|
||||
def hashtag_handler(text: str):
    """Wrap every '#tag' occurrence as ' <hashtag> tag </hashtag> '."""
    tag_pattern = re.compile(r"(#([^\s]+))")
    return tag_pattern.sub(r" <hashtag> \2 </hashtag> ", text)
|
||||
|
||||
|
||||
def cashtag_handler(text: str):
    """Wrap every '$TICKER' occurrence as ' <cashtag> TICKER </cashtag> '."""
    ticker_pattern = re.compile(r"(\$([^\s]+))")
    return ticker_pattern.sub(r" <cashtag> \2 </cashtag> ", text)
|
||||
|
||||
|
||||
def mention_handler(text: str):
    """Replace every '@username' mention with the anonymous ' @user ' token."""
    mention_pattern = re.compile(r"(@([^\s]+))")
    return mention_pattern.sub(" @user ", text)
|
||||
|
||||
|
||||
# Single module-level URLExtract instance, shared by url_handler.
url_extractor = URLExtract()
|
||||
|
||||
|
||||
def url_handler(text: str):
    """Replace every URL in *text* with ' <http> netloc </http> '.

    Bug fix: the previous version deduplicated the *domains* list via
    set(), which both shuffled it into arbitrary order and could make
    it shorter than the parallel *urls* list — so URLs were tagged with
    the wrong domain or skipped entirely. We instead deduplicate the
    URLs themselves (order-preserving) and derive each domain from its
    own URL, keeping the pairing intact.
    """
    # Explicit submodule import: bare `import urllib` does not
    # guarantee that urllib.parse has been loaded.
    from urllib.parse import urlparse

    # dict.fromkeys gives an order-preserving dedupe; str.replace below
    # substitutes every occurrence, so one pass per unique URL suffices.
    urls = list(dict.fromkeys(url_extractor.gen_urls(text)))
    for url in urls:
        # urlparse only fills .netloc when a scheme is present, so
        # prepend one for bare domains before parsing.
        parseable = url if "http" in url else f"https://{url}"
        domain = urlparse(parseable).netloc
        text = text.replace(url, f" <http> {domain} </http> ")
    return text
|
||||
|
||||
|
||||
def email_handler(text: str):
    """Mask every e-mail address in *text* with the ' <email> ' token."""
    addresses = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)
    for address in addresses:
        # .strip() after each substitution trims whitespace introduced
        # when the address sits at either end of the string.
        text = text.replace(address, " <email> ").strip()
    return text
|
||||
|
||||
|
||||
def emoji_handler(text: str):
    """Convert emoji to Turkish descriptions wrapped in <emoji> ... </emoji> tags."""
    open_tag, close_tag = " <emoji> ", " </emoji> "
    return demojize(text, language="tr", delimiters=(open_tag, close_tag))
|
||||
|
||||
|
||||
def normalize_text(text: str):
    """Return *text* in Unicode NFC (canonical composed) form."""
    canonical = normalize("NFC", text)
    return canonical
|
||||
|
||||
|
||||
def preprocess(text: str):
    """Run the full tweet-normalisation pipeline over *text*.

    Steps (order matters): HTML-unescape, Unicode NFC, then mask
    e-mails, URLs, hashtags, cashtags, mentions and emoji; finally
    collapse whitespace, lowercase, and trim.
    """
    pipeline = (
        html.unescape,
        normalize_text,
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    output = text
    for step in pipeline:
        output = step(output)
    output = re.sub(r"\s+", " ", output)
    return output.lower().strip()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick smoke test of the pipeline (empty input -> empty output).
    demo_text = ""
    print(preprocess(demo_text))
|
||||
|
||||
Reference in New Issue
Block a user