news-please-crawler/newsplease/pipeline/extractor/cleaner.py

# -*- coding: utf-8 -*-
import re
import sys

from lxml import html


class Cleaner:
    """The Cleaner class brings the raw text extracted by the extractors into a
    comparable format. To do so it removes unnecessary whitespace and, in the
    case of the readability extractor, HTML tags which are still present in the
    extracted text.
    """

    def delete_tags(self, arg):
        """Removes HTML tags from extracted data.

        :param arg: A string, the string which shall be cleaned
        :return: A string, the cleaned string
        """
        if len(arg) > 0:
            raw = html.fromstring(arg)
            return raw.text_content().strip()
        return arg
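
    # Illustrative example (assumption, not part of the original file): given
    # markup such as '<p>Breaking <b>news</b></p>', delete_tags() would return
    # 'Breaking news', since lxml's text_content() drops the tags and strip()
    # trims surrounding whitespace.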

    def delete_whitespaces(self, arg):
        """Removes newlines, tabs and whitespace at the beginning and the end of
        the string, and collapses runs of consecutive whitespace into a single
        character.

        :param arg: A string, the string which shall be cleaned
        :return: A string, the cleaned string
        """
        # Deletes whitespace directly after a newline
        arg = re.sub(r'(?<=\n)( )+', '', arg)
        # Deletes every whitespace, tab or newline at the beginning of the string
        arg = re.sub(r'^[ \t\n\r\f]*', '', arg)
        # Deletes a whitespace or tab if it is followed by another whitespace or tab
        arg = re.sub(r'[ \t]+(?=([ \t]))', '', arg)
        # Deletes a newline if it is followed by another newline
        arg = re.sub(r'[ \n]+(?=(\n))', '', arg)
        # Deletes newlines and whitespace at the end of the string
        arg = re.sub(r'[ \n]*$', '', arg)
        return arg
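
    # Illustrative example (assumption, not part of the original file): with the
    # regex chain above, '  Some  text \n\n\n  more text\n' would be reduced to
    # 'Some text\nmore text': leading and trailing whitespace is stripped,
    # repeated spaces are collapsed, and blank lines in between are removed.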

    def do_cleaning(self, arg):
        """Does the actual cleaning by using the delete methods above.

        :param arg: A string, the string which shall be cleaned
        :return: A string, the cleaned string
        """
        if arg is not None:
            if sys.version_info[0] < 3:
                # Python 2: make sure we operate on a unicode string
                arg = unicode(arg)
            else:
                arg = str(arg)
            arg = self.delete_tags(arg)
            arg = self.delete_whitespaces(arg)
            return arg
        else:
            return None
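
    # Usage sketch (assumption, not from the original file): do_cleaning() on
    # ' <p>Hello   world</p>\n' would first strip the tags ('Hello   world')
    # and then normalize the whitespace, yielding 'Hello world'.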

    def clean(self, list_article_candidates):
        """Iterates over the article candidates and cleans every piece of
        extracted data.

        :param list_article_candidates: A list, the list of ArticleCandidate objects which have been extracted
        :return: A list, the list with the cleaned ArticleCandidate objects
        """
        # Save cleaned article candidates in results.
        results = []
        for article_candidate in list_article_candidates:
            article_candidate.title = self.do_cleaning(article_candidate.title)
            article_candidate.description = self.do_cleaning(article_candidate.description)
            article_candidate.text = self.do_cleaning(article_candidate.text)
            article_candidate.topimage = self.do_cleaning(article_candidate.topimage)
            article_candidate.author = self.do_cleaning(article_candidate.author)
            article_candidate.publish_date = self.do_cleaning(article_candidate.publish_date)
            results.append(article_candidate)
        return results
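

# Usage sketch (illustrative assumption, not part of the original file): the
# extractor pipeline would pass the candidates produced by the individual
# extractors through the cleaner roughly like this, where `candidates` is a
# list of ArticleCandidate objects exposing title, description, text, topimage,
# author and publish_date attributes:
#
#   cleaner = Cleaner()
#   cleaned_candidates = cleaner.clean(candidates)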