mirror of
https://github.com/fhamborg/news-please.git
synced 2021-09-27 00:44:24 +03:00
fix bug
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
include newsplease/config/config.cfg
|
||||
include newsplease/config/config_lib.cfg
|
||||
include newsplease/config/sitelist.hjson
|
||||
include LICENSE.txt
|
||||
include README.md
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
|
||||
from newsplease.pipeline.pipelines import InMemoryStorage
|
||||
from newsplease.single_crawler import SingleCrawler
|
||||
|
||||
|
||||
class NewsPleaseLib:
    """
    Access news-please functionality via this interface.

    Articles are crawled synchronously via ``SingleCrawler`` and collected
    in the shared ``InMemoryStorage``; each accessor removes the entries it
    consumes so the store does not grow across calls.
    """

    @staticmethod
    def download_article(url):
        """
        Crawls the article from the url and extracts relevant information.

        :param url: URL of the article to crawl
        :return: the extracted article for ``url``
        :raises KeyError: if no result was stored for ``url``
        """
        SingleCrawler.create_as_library(url)
        results = InMemoryStorage.get_results()
        # pop() both fetches and removes the entry in a single lookup
        # (the original did a lookup followed by a separate del).
        return results.pop(url)

    @staticmethod
    def download_articles(urls):
        """
        Crawls articles from the urls and extracts relevant information.

        :param urls: iterable of article URLs to crawl
        :return: list of extracted articles, in the same order as ``urls``
        :raises KeyError: if any url has no stored result
        """
        SingleCrawler.create_as_library(urls)
        results = InMemoryStorage.get_results()
        articles = []
        for url in urls:
            # Consume-and-remove each result from the shared store.
            # NOTE: the debug print of the article title that was here has
            # been removed — a library API should not write to stdout.
            articles.append(results.pop(url))
        return articles
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
|
||||
from newsplease.pipeline.pipelines import InMemoryStorage
|
||||
from newsplease.single_crawler import SingleCrawler
|
||||
|
||||
|
||||
class NewsPleaseLib:
    """
    Access news-please functionality via this interface.
    """

    @staticmethod
    def download_article(url):
        """
        Crawls the article from the url and extracts relevant information.

        :param url: URL of the article to crawl
        :return: the extracted article for ``url``
        """
        SingleCrawler.create_as_library(url)
        store = InMemoryStorage.get_results()
        # Fetch the crawled entry and drop it from the shared store
        # in one step (equivalent to lookup followed by del).
        return store.pop(url)

    @staticmethod
    def download_articles(urls):
        """
        Crawls articles from the urls and extracts relevant information.

        :param urls: iterable of article URLs to crawl
        :return: list of extracted articles, one per url
        """
        SingleCrawler.create_as_library(urls)
        store = InMemoryStorage.get_results()
        collected = []
        for single_url in urls:
            # Consume each result from the shared store as we go.
            item = store.pop(single_url)
            collected.append(item)
            print(item['title'])
        return collected
|
||||
|
||||
if __name__ == '__main__':
    # Demo: crawl one known article when the module is run as a script.
    demo_url = ('http://www.zeit.de/politik/deutschland/2017-02/'
                'fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')
    NewsPleaseLib.download_article(demo_url)
|
||||
2
setup.py
2
setup.py
@@ -4,7 +4,7 @@ import sys, os
|
||||
|
||||
|
||||
setup(name='news-please',
|
||||
version='1.0.25',
|
||||
version='1.0.27',
|
||||
description="news-please is an open source easy-to-use news extractor that just works.",
|
||||
long_description="""\
|
||||
news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can follow recursively internal hyperlinks and read RSS feeds to fetch both most recent and also old, archived articles. You only need to provide the root URL of the news website.""",
|
||||
|
||||
Reference in New Issue
Block a user