1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-27 00:44:24 +03:00
Files
news-please-crawler/newsplease/pipeline/extractor/article_extractor.py
2016-11-09 18:33:45 +01:00

65 lines
2.5 KiB
Python

import logging
import importlib
import inspect
from .extractors.abstract_extractor import AbstractExtractor
from .cleaner import Cleaner
from .comparer.comparer import Comparer
class Extractor:
"""This class initializes all extractors and saves the results of them. When adding a new extractor, it needs to
be initialized here and added to list_extractor.
"""
def __init__(self, extractor_list):
"""Initializes all the extractors, comparers and the cleaner.
:param extractor_list: List of strings containing all extractors to be initialized.
"""
self.log = logging.getLogger(__name__)
self.extractor_list = []
for extractor in extractor_list:
module = importlib.import_module(__package__+'.extractors.'+extractor)
# check module for subclasses of AbstractExtractor
for member in inspect.getmembers(module, inspect.isclass):
if issubclass(member[1], AbstractExtractor) and member[0] != 'AbstractExtractor':
# instantiate extractor
instance = getattr(module, member[0], None)()
if instance is not None:
self.log.info('Extractor initialized: %s', extractor)
self.extractor_list.append(instance)
else:
self.log.error("Misconfiguration: An unknown Extractor was found and"
" will be ignored: %s", extractor)
self.cleaner = Cleaner()
self.comparer = Comparer()
def extract(self, item):
"""Runs the HTML-response trough a list of initialized extractors, a cleaner and compares the results.
:param item: NewscrawlerItem to be processed.
:return: An updated NewscrawlerItem including the results of the extraction
"""
article_candidates = []
for extractor in self.extractor_list:
article_candidates.append(extractor.extract(item))
article_candidates = self.cleaner.clean(article_candidates)
article = self.comparer.compare(item, article_candidates)
item['article_title'] = article.title
item['article_description'] = article.description
item['article_text'] = article.text
item['article_image'] = article.topimage
item['article_author'] = article.author
item['article_publish_date'] = article.publish_date
item['article_language'] = article.language
return item