1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-27 00:44:24 +03:00
Files
news-please-crawler/newsplease/pipeline/extractor/extractors/newspaper_extractor.py
2016-11-09 18:33:45 +01:00

46 lines
1.7 KiB
Python

import logging
from .abstract_extractor import AbstractExtractor
from ..article_candidate import ArticleCandidate
# Import Newspaper Article Extractor Library.
from newspaper import Article
class NewspaperExtractor(AbstractExtractor):
    """Article extractor backed by the Newspaper library.

    Subclass of AbstractExtractor that parses already-fetched HTML with
    ``newspaper.Article`` and copies the parsed fields onto an
    ArticleCandidate.
    """

    def __init__(self):
        self.log = logging.getLogger(__name__)
        # Identifier for this extractor; presumably what the inherited
        # ``_name()`` helper returns -- verify against AbstractExtractor.
        self.name = "newspaper"

    def extract(self, item):
        """Parse the HTML carried by ``item`` and return the extracted data.

        The Article is created with an empty URL and fed the response body
        directly, so Newspaper performs no download of its own.

        :param item: A NewscrawlerItem to parse. ``item['spider_response'].body``
            is used as the HTML input and ``item['url']`` is used in log output.
        :return: ArticleCandidate containing the recovered article data.
        """
        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name()

        # Empty URL on purpose: the HTML is supplied via set_html() below.
        article = Article('')
        article.set_html(item['spider_response'].body)
        article.parse()

        article_candidate.title = article.title
        article_candidate.description = article.meta_description
        article_candidate.text = article.text
        article_candidate.topimage = article.top_image
        article_candidate.author = article.authors

        if article.publish_date is not None:
            try:
                article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError:
                # strftime rejected the date; make the outcome match what the
                # log message promises instead of relying on the candidate's
                # default value.
                article_candidate.publish_date = None
                self.log.info('%s: Newspaper failed to extract the date in the supported format, '
                              'Publishing date set to None', item['url'])

        article_candidate.language = article.meta_lang
        return article_candidate