mirror of
https://github.com/fhamborg/news-please.git
synced 2021-09-27 00:44:24 +03:00
66 lines
2.0 KiB
Python
66 lines
2.0 KiB
Python
from abc import ABCMeta, abstractmethod
|
|
from ..article_candidate import ArticleCandidate
|
|
|
|
|
|
class AbstractExtractor:
|
|
"""Abstract class for article extractors.
|
|
"""
|
|
|
|
__metaclass__ = ABCMeta
|
|
|
|
@abstractmethod
|
|
def __init__(self):
|
|
self.name = None
|
|
|
|
def _name(self):
|
|
"""Returns the name of the article extractor."""
|
|
return self.name
|
|
|
|
def _language(self, item):
|
|
"""Returns the language of the extracted article."""
|
|
return None
|
|
|
|
def _title(self, item):
|
|
"""Returns the title of the extracted article."""
|
|
return None
|
|
|
|
def _description(self, item):
|
|
"""Returns the description/lead paragraph of the extracted article."""
|
|
return None
|
|
|
|
def _text(self, item):
|
|
"""Returns the main text of the extracted article."""
|
|
return None
|
|
|
|
def _topimage(self, item):
|
|
"""Returns the top image of the extracted article."""
|
|
return None
|
|
|
|
def _author(self, item):
|
|
"""Returns the authors of the extracted article."""
|
|
return None
|
|
|
|
def _publish_date(self, item):
|
|
"""Returns the publish date of the extracted article."""
|
|
return None
|
|
|
|
def extract(self, item):
|
|
"""Executes all implemented functions on the given article and returns an
|
|
object containing the recovered data.
|
|
|
|
:param item: A NewscrawlerItem to parse.
|
|
:return: ArticleCandidate containing the recovered article data.
|
|
"""
|
|
|
|
article_candidate = ArticleCandidate()
|
|
article_candidate.extractor = self._name()
|
|
article_candidate.title = self._title(item)
|
|
article_candidate.description = self._description(item)
|
|
article_candidate.text = self._text(item)
|
|
article_candidate.topimage = self._topimage(item)
|
|
article_candidate.author = self._author(item)
|
|
article_candidate.publish_date = self._publish_date(item)
|
|
article_candidate.language = self._language(item)
|
|
|
|
return article_candidate
|