1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-19 22:26:00 +03:00

Merge pull request #203 from mood-mapping-muppets/no-fetch-images-newspaper

Add option of not fetching images using newspaper library and make default for commoncrawl
This commit is contained in:
Felix Hamborg
2021-02-24 08:43:52 +01:00
committed by GitHub
5 changed files with 48 additions and 20 deletions

View File

@@ -25,7 +25,7 @@ class NewsPlease:
"""
@staticmethod
def from_warc(warc_record, decode_errors="replace"):
def from_warc(warc_record, decode_errors="replace", fetch_images=True):
"""
Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
extractor.
@@ -52,11 +52,11 @@ class NewsPlease:
raise EmptyResponseError()
url = warc_record.rec_headers.get_header('WARC-Target-URI')
download_date = warc_record.rec_headers.get_header('WARC-Date')
article = NewsPlease.from_html(html, url=url, download_date=download_date)
article = NewsPlease.from_html(html, url=url, download_date=download_date, fetch_images=fetch_images)
return article
@staticmethod
def from_html(html, url=None, download_date=None):
def from_html(html, url=None, download_date=None, fetch_images=True):
"""
Extracts relevant information from an HTML page given as a string. This function does not invoke scrapy but only
uses the article extractor. If you have the original URL make sure to provide it as this helps NewsPlease
@@ -66,7 +66,13 @@ class NewsPlease:
:return:
"""
extractor = article_extractor.Extractor(
['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'])
(
['newspaper_extractor']
if fetch_images
else [("newspaper_extractor_no_images", "NewspaperExtractorNoImages")]
) +
['readability_extractor', 'date_extractor', 'lang_detect_extractor']
)
title_encoded = ''.encode()
if not url:

View File

@@ -44,6 +44,8 @@ class CommonCrawlExtractor:
__continue_after_error = False
# ignore unicode errors
__ignore_unicode_errors = False
# fetch images
__fetch_images = False
# log level
__log_level = logging.INFO
__delete_warc_after_extraction = True
@@ -230,7 +232,7 @@ class CommonCrawlExtractor:
return local_filepath
def _from_warc(self, record):
return NewsPlease.from_warc(record, decode_errors="replace" if self.__ignore_unicode_errors else "strict")
return NewsPlease.from_warc(record, decode_errors="replace" if self.__ignore_unicode_errors else "strict", fetch_images=self.__fetch_images)
def __process_warc_gz_file(self, path_name):
"""
@@ -323,8 +325,8 @@ class CommonCrawlExtractor:
valid_hosts=None,
start_date=None, end_date=None,
strict_date=True, reuse_previously_downloaded_files=True, local_download_dir_warc=None,
continue_after_error=True, ignore_unicode_errors=False, show_download_progress=False,
log_level=logging.ERROR, delete_warc_after_extraction=True,
continue_after_error=True, ignore_unicode_errors=False, fetch_images=False,
show_download_progress=False, log_level=logging.ERROR, delete_warc_after_extraction=True,
log_pathname_fully_extracted_warcs=None):
"""
Crawl and extract articles from the news crawl provided by commoncrawl.org. For each article that was extracted
@@ -356,6 +358,7 @@ class CommonCrawlExtractor:
self.__reuse_previously_downloaded_files = reuse_previously_downloaded_files
self.__continue_after_error = continue_after_error
self.__ignore_unicode_errors = ignore_unicode_errors
self.__fetch_images = fetch_images
self.__callback_on_article_extracted = callback_on_article_extracted
self.__callback_on_warc_completed = callback_on_warc_completed
self.__show_download_progress = show_download_progress

View File

@@ -18,24 +18,34 @@ class Extractor:
:param extractor_list: List of strings containing all extractors to be initialized.
"""
def proc_instance(instance):
if instance is not None:
self.log.info('Extractor initialized: %s', extractor)
self.extractor_list.append(instance)
else:
self.log.error("Misconfiguration: An unknown Extractor was found and"
" will be ignored: %s", extractor)
self.log = logging.getLogger(__name__)
self.extractor_list = []
for extractor in extractor_list:
module = importlib.import_module(__package__ + '.extractors.' + extractor)
if isinstance(extractor, tuple):
extractor_module = extractor[0]
else:
extractor_module = extractor
# check module for subclasses of AbstractExtractor
for member in inspect.getmembers(module, inspect.isclass):
if issubclass(member[1], AbstractExtractor) and member[0] != 'AbstractExtractor':
module = importlib.import_module(__package__ + '.extractors.' + extractor_module)
# instantiate extractor
instance = getattr(module, member[0], None)()
if instance is not None:
self.log.info('Extractor initialized: %s', extractor)
self.extractor_list.append(instance)
else:
self.log.error("Misconfiguration: An unknown Extractor was found and"
" will be ignored: %s", extractor)
if isinstance(extractor, tuple):
proc_instance(getattr(module, extractor[1], None)())
else:
# check module for subclasses of AbstractExtractor
for member in inspect.getmembers(module, inspect.isclass):
if issubclass(member[1], AbstractExtractor) and member[0] != 'AbstractExtractor':
# instantiate extractor
proc_instance(getattr(module, member[0], None)())
self.cleaner = Cleaner()
self.comparer = Comparer()

View File

@@ -15,6 +15,9 @@ class NewspaperExtractor(AbstractExtractor):
self.log = logging.getLogger(__name__)
self.name = "newspaper"
def _article_kwargs(self):
    """Return extra keyword arguments for constructing newspaper.Article.

    The base extractor passes no extras (images are fetched by default);
    subclasses override this hook to customize Article construction —
    e.g. NewspaperExtractorNoImages returns {"fetch_images": False}.
    """
    return {}
def extract(self, item):
"""Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
parsing the HTML-Code.
@@ -25,7 +28,7 @@ class NewspaperExtractor(AbstractExtractor):
article_candidate = ArticleCandidate()
article_candidate.extractor = self._name()
article = Article('')
article = Article('', **self._article_kwargs())
article.set_html(item['spider_response'].body)
article.parse()
article_candidate.title = article.title

View File

@@ -0,0 +1,6 @@
from .newspaper_extractor import NewspaperExtractor
class NewspaperExtractorNoImages(NewspaperExtractor):
    """Variant of NewspaperExtractor that does not download article images.

    Overrides the _article_kwargs hook so that newspaper.Article is
    constructed with fetch_images=False, which suppresses image fetching
    during parsing.
    """

    def _article_kwargs(self):
        # Equivalent to {"fetch_images": False}; disables image downloads.
        return dict(fetch_images=False)