mirror of
https://github.com/fhamborg/news-please.git
synced 2021-09-27 00:44:24 +03:00
83 lines
3.0 KiB
Python
83 lines
3.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
import re
|
|
import sys
|
|
from lxml import html
|
|
|
|
|
|
class Cleaner:
    """Bring the raw text returned by the extractors into a comparable format.

    Extractors hand back text in slightly different shapes: some leave
    HTML tags in the extracted text (e.g. readability), others leave runs
    of spaces, tabs or newlines. The methods of this class remove the
    markup and the superfluous whitespace.
    """

    def delete_tags(self, arg):
        """Remove HTML tags from extracted data.

        :param arg: A string, the string which shall be cleaned
        :return: A string, the text content of ``arg`` without markup and
                 without leading/trailing whitespace
        """
        # lxml cannot build a document from blank input and raises a
        # ParserError ("Document is empty"), so empty and whitespace-only
        # strings are answered directly.
        stripped = arg.strip()
        if not stripped:
            return stripped

        try:
            raw = html.fromstring(arg)
        except ValueError:
            # lxml refuses unicode strings that start with an XML encoding
            # declaration; the same content is accepted as bytes.
            raw = html.fromstring(arg.encode('utf-8'))
        return raw.text_content().strip()

    def delete_whitespaces(self, arg):
        """Remove superfluous spaces, tabs and newlines.

        Whitespace is removed at the beginning and at the end of the
        string, directly after a newline, and wherever more than one
        whitespace character / newline occurs in a row.

        :param arg: A string, the string which shall be cleaned
        :return: A string, the cleaned string
        """
        # The same character set is stripped from both ends; previously
        # trailing tabs, carriage returns and form feeds were left behind.
        edge_whitespace = ' \t\n\r\f'

        # Delete spaces that directly follow a newline.
        arg = re.sub(r'(?<=\n)( )+', '', arg)
        # Delete every space, tab and newline at the beginning.
        arg = arg.lstrip(edge_whitespace)
        # Collapse runs of spaces/tabs: everything but the last character
        # of a run is deleted.
        arg = re.sub(r'[ \t]+(?=([ \t]))', '', arg)
        # Delete spaces/newlines that are followed by another newline.
        arg = re.sub(r'[ \n]+(?=(\n))', '', arg)
        # Delete every space, tab and newline at the end.
        return arg.rstrip(edge_whitespace)

    def do_cleaning(self, arg):
        """Do the actual cleaning by applying the delete methods above.

        :param arg: The value which shall be cleaned; anything that is not
                    None is converted to a (unicode) string first
        :return: A string, the cleaned string, or None if ``arg`` is None
        """
        if arg is None:
            return None

        # ``unicode`` only exists on Python 2, ``str`` is its Python 3
        # counterpart.
        if sys.version_info[0] < 3:
            arg = unicode(arg)  # noqa: F821
        else:
            arg = str(arg)

        return self.delete_whitespaces(self.delete_tags(arg))

    def clean(self, list_article_candidates):
        """Clean every extracted field of each article candidate.

        The candidates are modified in place; the returned list holds the
        same objects in the same order.

        :param list_article_candidates: A list, the list of
               ArticleCandidate-Objects which have been extracted
        :return: A list, the list with the cleaned ArticleCandidate-Objects
        """
        # Attributes of a candidate that carry extracted data.
        fields = ('title', 'description', 'text', 'topimage', 'author',
                  'publish_date')

        # Save cleaned article_candidates in results.
        results = []
        for article_candidate in list_article_candidates:
            for field in fields:
                cleaned = self.do_cleaning(getattr(article_candidate, field))
                setattr(article_candidate, field, cleaned)
            results.append(article_candidate)

        return results
|