1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-27 00:44:24 +03:00
Files
news-please-crawler/newsplease/pipeline/extractor/extractors/abstract_extractor.py
2016-11-09 18:33:45 +01:00

66 lines
2.0 KiB
Python

from abc import ABCMeta, abstractmethod
from ..article_candidate import ArticleCandidate
class AbstractExtractor:
"""Abstract class for article extractors.
"""
__metaclass__ = ABCMeta
@abstractmethod
def __init__(self):
self.name = None
def _name(self):
"""Returns the name of the article extractor."""
return self.name
def _language(self, item):
"""Returns the language of the extracted article."""
return None
def _title(self, item):
"""Returns the title of the extracted article."""
return None
def _description(self, item):
"""Returns the description/lead paragraph of the extracted article."""
return None
def _text(self, item):
"""Returns the main text of the extracted article."""
return None
def _topimage(self, item):
"""Returns the top image of the extracted article."""
return None
def _author(self, item):
"""Returns the authors of the extracted article."""
return None
def _publish_date(self, item):
"""Returns the publish date of the extracted article."""
return None
def extract(self, item):
"""Executes all implemented functions on the given article and returns an
object containing the recovered data.
:param item: A NewscrawlerItem to parse.
:return: ArticleCandidate containing the recovered article data.
"""
article_candidate = ArticleCandidate()
article_candidate.extractor = self._name()
article_candidate.title = self._title(item)
article_candidate.description = self._description(item)
article_candidate.text = self._text(item)
article_candidate.topimage = self._topimage(item)
article_candidate.author = self._author(item)
article_candidate.publish_date = self._publish_date(item)
article_candidate.language = self._language(item)
return article_candidate