import datetime
import os
import sys

from bs4.dammit import EncodingDetector
from six.moves import urllib

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

from newsplease.pipeline.extractor import article_extractor
from newsplease.crawler.items import NewscrawlerItem
from dotmap import DotMap
from newsplease.pipeline.pipelines import ExtractedInformationStorage
from newsplease.crawler.simple_crawler import SimpleCrawler


class EmptyResponseError(ValueError):
    pass


class NewsPlease:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def from_warc(warc_record, decode_errors="replace", fetch_images=True):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the
        article extractor.
        :param warc_record: a WARC response record (e.g., as yielded by warcio's ArchiveIterator)
        :param decode_errors: error handling scheme passed to bytes.decode(), e.g. "replace" or "strict"
        :param fetch_images: whether the extractor should also download the article's images
        :return: a NewsArticle object with the extracted information
        """
        raw_stream = warc_record.raw_stream.read()
        encoding = None
        try:
            # take the charset announced in the HTTP Content-Type header, if present
            encoding = warc_record.http_headers.get_header('Content-Type').split(';')[1].split('=')[1]
        except (AttributeError, IndexError):
            pass
        if not encoding:
            encoding = EncodingDetector.find_declared_encoding(raw_stream, is_html=True)
        if not encoding:
            # assume utf-8
            encoding = 'utf-8'

        try:
            html = raw_stream.decode(encoding, errors=decode_errors)
        except LookupError:
            # non-existent encoding: fall back to utf-8
            html = raw_stream.decode('utf-8', errors=decode_errors)
        if not html:
            raise EmptyResponseError()
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html, url=url, download_date=download_date, fetch_images=fetch_images)
        return article
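    # Usage sketch (an assumption, not part of the original file): it assumes the WARC
    # file is read with the warcio package, whose records expose the raw_stream,
    # http_headers and rec_headers attributes used above; the filename and the
    # "title" attribute on the returned object are placeholders.
    #
    #   from warcio.archiveiterator import ArchiveIterator
    #   with open('articles.warc.gz', 'rb') as stream:
    #       for record in ArchiveIterator(stream):
    #           if record.rec_type == 'response':
    #               article = NewsPlease.from_warc(record, fetch_images=False)
    #               print(article.title)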

    @staticmethod
    def from_html(html, url=None, download_date=None, fetch_images=True):
        """
        Extracts relevant information from an HTML page given as a string. This function does not invoke scrapy but
        only uses the article extractor. If you have the original URL, make sure to provide it, as this helps
        news-please extract the publishing date and title.
        :param html: the HTML of the article page as a string
        :param url: the original URL of the page (optional but recommended)
        :param download_date: the date/time the page was downloaded (optional)
        :param fetch_images: whether the extractor should also download the article's images
        :return: a NewsArticle object with the extracted information
        """
        # choose the newspaper extractor variant depending on whether images should be fetched
        extractor = article_extractor.Extractor(
            (
                ['newspaper_extractor']
                if fetch_images
                else [("newspaper_extractor_no_images", "NewspaperExtractorNoImages")]
            ) +
            ['readability_extractor', 'date_extractor', 'lang_detect_extractor']
        )

        title_encoded = ''.encode()
        if not url:
            url = ''

        # if a URL was given, we can use it as the filename
        filename = urllib.parse.quote_plus(url) + '.json'

        item = NewscrawlerItem()
        item['spider_response'] = DotMap()
        item['spider_response'].body = html
        item['url'] = url
        item['source_domain'] = urllib.parse.urlparse(url).hostname.encode() if url != '' else ''.encode()
        item['html_title'] = title_encoded
        item['rss_title'] = title_encoded
        item['local_path'] = None
        item['filename'] = filename
        item['download_date'] = download_date
        item['modified_date'] = None
        item = extractor.extract(item)

        tmp_article = ExtractedInformationStorage.extract_relevant_info(item)
        final_article = ExtractedInformationStorage.convert_to_class(tmp_article)
        return final_article
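    # Usage sketch (an assumption, not part of the original file): the URL is a
    # placeholder, and the "title"/"maintext" attribute names on the returned
    # object follow ExtractedInformationStorage.
    #
    #   html = '<html>...</html>'  # any article page fetched elsewhere
    #   article = NewsPlease.from_html(html, url='https://example.com/some-article')
    #   print(article.title, article.maintext)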

    @staticmethod
    def from_url(url, timeout=None):
        """
        Crawls the article from the url and extracts relevant information.
        :param url: the URL of the article to crawl
        :param timeout: in seconds, if None, the urllib default is used
        :return: a NewsArticle object containing all the information of the article if extraction succeeded, None otherwise
        :rtype: NewsArticle, None
        """
        articles = NewsPlease.from_urls([url], timeout=timeout)
        if url in articles:
            return articles[url]
        else:
            return None
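    # Usage sketch (an assumption, not part of the original file; the URL and the
    # "title" attribute are placeholders):
    #
    #   article = NewsPlease.from_url('https://www.example.com/some-article.html', timeout=10)
    #   if article is not None:
    #       print(article.title)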

    @staticmethod
    def from_urls(urls, timeout=None):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls: a list of article URLs to crawl
        :param timeout: in seconds, if None, the urllib default is used
        :return: a dict containing the given URLs as keys, and the extracted information as corresponding values
        """
        results = {}
        download_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if len(urls) == 0:
            # nothing to crawl; return the empty dict
            pass
        elif len(urls) == 1:
            url = urls[0]
            html = SimpleCrawler.fetch_url(url, timeout=timeout)
            results[url] = NewsPlease.from_html(html, url, download_date)
        else:
            results = SimpleCrawler.fetch_urls(urls)
            for url, html in results.items():
                results[url] = NewsPlease.from_html(html, url, download_date)

        return results
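    # Usage sketch (an assumption, not part of the original file; URLs and the
    # "title" attribute are placeholders):
    #
    #   results = NewsPlease.from_urls(['https://example.com/a.html', 'https://example.com/b.html'])
    #   for url, article in results.items():
    #       print(url, article.title if article else None)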

    @staticmethod
    def from_file(path):
        """
        Reads URLs from a file (one URL per line), crawls the corresponding articles, and extracts relevant
        information.
        :param path: path to file containing urls (each line contains one URL)
        :return: a dict containing the given URLs as keys, and the extracted information as corresponding values
        """
        with open(path) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        urls = list(filter(None, content))

        return NewsPlease.from_urls(urls)
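
# Minimal end-to-end sketch, not part of the original file. Assumptions: a plain-text
# file named "urls.txt" with one URL per line next to this module, and a "title"
# attribute on the returned NewsArticle objects. It only runs when the module is
# executed directly and the sample file exists.
if __name__ == '__main__':
    sample_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'urls.txt')
    if os.path.exists(sample_file):
        for sample_url, sample_article in NewsPlease.from_file(sample_file).items():
            print(sample_url, sample_article.title if sample_article else None)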