def url_handler(text: str) -> str:
    """Replace every URL found in *text* with its domain, padded by spaces.

    URLs are discovered with the module-level ``url_extractor``. A URL that
    does not contain ``"http"`` is prefixed with ``https://`` before parsing,
    so that ``urllib.parse.urlparse`` places the host in ``netloc``.

    Args:
        text: Raw input text that may contain URLs.

    Returns:
        The text with each URL occurrence replaced by ``" <domain> "``.
        Text in which no URL is found is returned unchanged.
    """
    # Local import: the file's import block is not visible from this hunk.
    import re

    # Map each distinct URL to its own domain. Keeping the URL and its domain
    # together avoids pairing a URL with a different URL's domain, which is
    # what happens when a de-duplicated list is indexed against the raw one.
    domain_by_url = {}
    for url in url_extractor.gen_urls(text):
        if not url or url in domain_by_url:
            continue
        full_url = url if "http" in url else f"https://{url}"
        domain_by_url[url] = urllib.parse.urlparse(full_url).netloc

    if not domain_by_url:
        return text

    # Substitute in a single pass so that already-inserted domains are never
    # re-scanned. Longest alternatives go first so that a URL which is a
    # prefix of another (e.g. "a.com" vs "a.com/x") cannot match inside it.
    longest_first = sorted(domain_by_url, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(url) for url in longest_first))
    return pattern.sub(lambda match: f" {domain_by_url[match.group(0)]} ", text)