mirror of
https://github.com/fhamborg/news-please.git
synced 2021-09-27 00:44:24 +03:00
46 lines
1.7 KiB
Python
46 lines
1.7 KiB
Python
import logging
|
|
from .abstract_extractor import AbstractExtractor
|
|
from ..article_candidate import ArticleCandidate
|
|
# Import Newspaper Article Extractor Library.
|
|
from newspaper import Article
|
|
|
|
|
|
class NewspaperExtractor(AbstractExtractor):
    """Article extractor backed by the Newspaper library.

    Subclass of AbstractExtractor that parses HTML which has already been
    fetched (no download is triggered here) with ``newspaper.Article`` and
    copies the parsed fields into an ArticleCandidate.
    """

    def __init__(self):
        self.log = logging.getLogger(__name__)
        # Identifier of this extractor; presumably what the inherited
        # ``_name()`` reports -- confirm against AbstractExtractor.
        self.name = "newspaper"

    def extract(self, item):
        """Parse the HTML of ``item`` with Newspaper and return the results.

        An ``Article`` is created with an empty URL and fed the HTML directly,
        so no download is performed.

        :param item: A NewscrawlerItem to parse. ``item['spider_response'].body``
            is used as the HTML input; ``item['url']`` is read only when
            logging a date-formatting failure.
        :return: ArticleCandidate containing the recovered article data.
        """
        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name()

        # Empty URL: the HTML is injected via set_html() instead of fetched.
        article = Article('')
        article.set_html(item['spider_response'].body)
        article.parse()

        article_candidate.title = article.title
        article_candidate.description = article.meta_description
        article_candidate.text = article.text
        article_candidate.topimage = article.top_image
        article_candidate.author = article.authors

        if article.publish_date is not None:
            try:
                article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError:
                # Make the candidate match what the log message states
                # instead of relying on ArticleCandidate's default value.
                article_candidate.publish_date = None
                self.log.info(
                    '%s: Newspaper failed to extract the date in the supported format, '
                    'Publishing date set to None',
                    item['url'],
                )

        article_candidate.language = article.meta_lang

        return article_candidate
|