Mirror of https://github.com/fhamborg/news-please.git, synced 2021-09-19 22:26:00 +03:00
Merge pull request #203 from mood-mapping-muppets/no-fetch-images-newspaper
Add an option to skip fetching images with the newspaper library, and make it the default for the CommonCrawl extractor
@@ -25,7 +25,7 @@ class NewsPlease:
     """

     @staticmethod
-    def from_warc(warc_record, decode_errors="replace"):
+    def from_warc(warc_record, decode_errors="replace", fetch_images=True):
         """
         Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
         extractor.
@@ -52,11 +52,11 @@ class NewsPlease:
             raise EmptyResponseError()
         url = warc_record.rec_headers.get_header('WARC-Target-URI')
         download_date = warc_record.rec_headers.get_header('WARC-Date')
-        article = NewsPlease.from_html(html, url=url, download_date=download_date)
+        article = NewsPlease.from_html(html, url=url, download_date=download_date, fetch_images=fetch_images)
         return article

     @staticmethod
-    def from_html(html, url=None, download_date=None):
+    def from_html(html, url=None, download_date=None, fetch_images=True):
         """
         Extracts relevant information from an HTML page given as a string. This function does not invoke scrapy but only
         uses the article extractor. If you have the original URL make sure to provide it as this helps NewsPlease
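With the new keyword in place, callers can disable image fetching per WARC record. A minimal sketch, assuming a warcio-style record (consistent with the `rec_headers` accessor in the hunk above); the filename is a hypothetical placeholder:

```python
from warcio.archiveiterator import ArchiveIterator

from newsplease import NewsPlease

# 'news.warc.gz' is a hypothetical local WARC archive.
with open('news.warc.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        if record.rec_type == 'response':
            # fetch_images=False is forwarded through to from_html below
            article = NewsPlease.from_warc(record, fetch_images=False)
            print(article.title)
```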
@@ -66,7 +66,13 @@ class NewsPlease:
         :return:
         """
         extractor = article_extractor.Extractor(
-            ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'])
+            (
+                ['newspaper_extractor']
+                if fetch_images
+                else [("newspaper_extractor_no_images", "NewspaperExtractorNoImages")]
+            ) +
+            ['readability_extractor', 'date_extractor', 'lang_detect_extractor']
+        )

         title_encoded = ''.encode()
         if not url:
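The tuple entry is how `from_html` swaps in the no-images extractor. A short usage sketch (the URL and HTML are made up for illustration):

```python
from newsplease import NewsPlease

html = '<html><head><title>Example story</title></head><body><p>Body text.</p></body></html>'
# With fetch_images=False, the extractor list starts with the
# ("newspaper_extractor_no_images", "NewspaperExtractorNoImages") tuple
# instead of the plain 'newspaper_extractor' module name.
article = NewsPlease.from_html(html, url='https://example.com/story', fetch_images=False)
print(article.title)
```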
@@ -44,6 +44,8 @@ class CommonCrawlExtractor:
     __continue_after_error = False
     # ignore unicode errors
     __ignore_unicode_errors = False
+    # fetch images
+    __fetch_images = False
     # log level
     __log_level = logging.INFO
     __delete_warc_after_extraction = True
@@ -230,7 +232,7 @@ class CommonCrawlExtractor:
         return local_filepath

     def _from_warc(self, record):
-        return NewsPlease.from_warc(record, decode_errors="replace" if self.__ignore_unicode_errors else "strict")
+        return NewsPlease.from_warc(record, decode_errors="replace" if self.__ignore_unicode_errors else "strict", fetch_images=self.__fetch_images)

     def __process_warc_gz_file(self, path_name):
         """
@@ -323,8 +325,8 @@ class CommonCrawlExtractor:
                                  valid_hosts=None,
                                  start_date=None, end_date=None,
                                  strict_date=True, reuse_previously_downloaded_files=True, local_download_dir_warc=None,
-                                 continue_after_error=True, ignore_unicode_errors=False, show_download_progress=False,
-                                 log_level=logging.ERROR, delete_warc_after_extraction=True,
+                                 continue_after_error=True, ignore_unicode_errors=False, fetch_images=False,
+                                 show_download_progress=False, log_level=logging.ERROR, delete_warc_after_extraction=True,
                                  log_pathname_fully_extracted_warcs=None):
        """
        Crawl and extract articles form the news crawl provided by commoncrawl.org. For each article that was extracted
@@ -356,6 +358,7 @@ class CommonCrawlExtractor:
        self.__reuse_previously_downloaded_files = reuse_previously_downloaded_files
        self.__continue_after_error = continue_after_error
        self.__ignore_unicode_errors = ignore_unicode_errors
+        self.__fetch_images = fetch_images
        self.__callback_on_article_extracted = callback_on_article_extracted
        self.__callback_on_warc_completed = callback_on_warc_completed
        self.__show_download_progress = show_download_progress
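Taken together, these hunks thread the flag from the public signature into `_from_warc`, with `False` as the CommonCrawl default. A sketch of a call site, assuming the signature above belongs to `CommonCrawlExtractor.extract_from_commoncrawl` (the method name and module path are not visible in this diff, and the WARC URL is a placeholder; the keyword names come from the hunks above):

```python
import logging

from newsplease.crawler.commoncrawl_extractor import CommonCrawlExtractor

def on_article(article):
    # invoked once per successfully extracted article
    print(article.url)

def on_warc_completed(*args):
    # invoked after a WARC file has been fully processed
    pass

CommonCrawlExtractor().extract_from_commoncrawl(
    'https://data.commoncrawl.org/crawl-data/...',  # placeholder WARC download URL
    callback_on_article_extracted=on_article,
    callback_on_warc_completed=on_warc_completed,
    fetch_images=False,  # redundant: False is already the default here
    log_level=logging.INFO,
)
```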
@@ -18,24 +18,34 @@ class Extractor:

        :param extractor_list: List of strings containing all extractors to be initialized.
        """
+        def proc_instance(instance):
+            if instance is not None:
+                self.log.info('Extractor initialized: %s', extractor)
+                self.extractor_list.append(instance)
+            else:
+                self.log.error("Misconfiguration: An unknown Extractor was found and"
+                               " will be ignored: %s", extractor)
+
        self.log = logging.getLogger(__name__)
        self.extractor_list = []
        for extractor in extractor_list:
-
-            module = importlib.import_module(__package__ + '.extractors.' + extractor)
+            if isinstance(extractor, tuple):
+                extractor_module = extractor[0]
+            else:
+                extractor_module = extractor
+
+            module = importlib.import_module(__package__ + '.extractors.' + extractor_module)

-            # check module for subclasses of AbstractExtractor
-            for member in inspect.getmembers(module, inspect.isclass):
-                if issubclass(member[1], AbstractExtractor) and member[0] != 'AbstractExtractor':
-
-                    # instantiate extractor
-                    instance = getattr(module, member[0], None)()
-                    if instance is not None:
-                        self.log.info('Extractor initialized: %s', extractor)
-                        self.extractor_list.append(instance)
-                    else:
-                        self.log.error("Misconfiguration: An unknown Extractor was found and"
-                                       " will be ignored: %s", extractor)
+            if isinstance(extractor, tuple):
+                proc_instance(getattr(module, extractor[1], None)())
+            else:
+                # check module for subclasses of AbstractExtractor
+                for member in inspect.getmembers(module, inspect.isclass):
+                    if issubclass(member[1], AbstractExtractor) and member[0] != 'AbstractExtractor':
+
+                        # instantiate extractor
+                        proc_instance(getattr(module, member[0], None)())

        self.cleaner = Cleaner()
        self.comparer = Comparer()
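The refactor makes the extractor list heterogeneous: a plain string is still a module name scanned for `AbstractExtractor` subclasses, while a `(module_name, class_name)` tuple selects one class explicitly. A minimal sketch of constructing an `Extractor` directly (the import path is an assumption based on the `article_extractor` reference in `from_html`):

```python
from newsplease.pipeline.extractor import article_extractor

extractor = article_extractor.Extractor([
    # tuple form: load this one class from the named module
    ("newspaper_extractor_no_images", "NewspaperExtractorNoImages"),
    # string form: scan the module for AbstractExtractor subclasses
    'readability_extractor',
    'date_extractor',
    'lang_detect_extractor',
])
```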
@@ -15,6 +15,9 @@ class NewspaperExtractor(AbstractExtractor):
         self.log = logging.getLogger(__name__)
         self.name = "newspaper"

+    def _article_kwargs(self):
+        return {}
+
     def extract(self, item):
         """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
         parsing the HTML-Code.
@@ -25,7 +28,7 @@ class NewspaperExtractor(AbstractExtractor):
         article_candidate = ArticleCandidate()
         article_candidate.extractor = self._name()

-        article = Article('')
+        article = Article('', **self._article_kwargs())
         article.set_html(item['spider_response'].body)
         article.parse()
         article_candidate.title = article.title
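The empty-kwargs hook works because, as far as I know, newspaper3k forwards unrecognized `Article` keyword arguments into its `Configuration` object, where `fetch_images` is a supported flag. A standalone sketch of that mechanism:

```python
from newspaper import Article

# Extra keyword arguments extend the article's Configuration,
# so this disables image fetching for this one article only.
article = Article('', fetch_images=False)
article.set_html('<html><body><h1>Title</h1><p>Some text.</p></body></html>')
article.parse()
print(article.title)
```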
@@ -0,0 +1,6 @@
+from .newspaper_extractor import NewspaperExtractor
+
+
+class NewspaperExtractorNoImages(NewspaperExtractor):
+    def _article_kwargs(self):
+        return {"fetch_images": False}