From b78bb39ddc0d21df8552268e917207c5ffc0ee15 Mon Sep 17 00:00:00 2001
From: felix
Date: Fri, 24 Feb 2017 17:53:48 +0100
Subject: [PATCH] fix bug

---
 MANIFEST.in                 |  1 +
 newsplease/__init__.py      | 42 ++++++++++++++++++++++++++++++++++
 newsplease/newspleaselib.py | 45 -------------------------------------
 setup.py                    |  2 +-
 4 files changed, 44 insertions(+), 46 deletions(-)
 delete mode 100644 newsplease/newspleaselib.py

diff --git a/MANIFEST.in b/MANIFEST.in
index b575402..6c79173 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,5 @@
 include newsplease/config/config.cfg
+include newsplease/config/config_lib.cfg
 include newsplease/config/sitelist.hjson
 include LICENSE.txt
 include README.md
diff --git a/newsplease/__init__.py b/newsplease/__init__.py
index e69de29..9b3a193 100644
--- a/newsplease/__init__.py
+++ b/newsplease/__init__.py
@@ -0,0 +1,42 @@
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from newsplease.pipeline.pipelines import InMemoryStorage
+from newsplease.single_crawler import SingleCrawler
+
+
+class NewsPleaseLib:
+    """
+    Access news-please functionality via this interface
+    """
+
+    @staticmethod
+    def download_article(url):
+        """
+        Crawls the article from the url and extracts relevant information.
+        :param url:
+        :return:
+        """
+        SingleCrawler.create_as_library(url)
+        results = InMemoryStorage.get_results()
+        article = results[url]
+        del results[url]
+        return article
+
+    @staticmethod
+    def download_articles(urls):
+        """
+        Crawls articles from the urls and extracts relevant information.
+        :param urls:
+        :return:
+        """
+        SingleCrawler.create_as_library(urls)
+        results = InMemoryStorage.get_results()
+        articles = []
+        for url in urls:
+            article = results[url]
+            del results[url]
+            articles.append(article)
+            print(article['title'])
+        return articles
diff --git a/newsplease/newspleaselib.py b/newsplease/newspleaselib.py
deleted file mode 100644
index 83d39e1..0000000
--- a/newsplease/newspleaselib.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from newsplease.pipeline.pipelines import InMemoryStorage
-from newsplease.single_crawler import SingleCrawler
-
-
-class NewsPleaseLib:
-    """
-    Access news-please functionality via this interface
-    """
-
-    @staticmethod
-    def download_article(url):
-        """
-        Crawls the article from the url and extracts relevant information.
-        :param url:
-        :return:
-        """
-        SingleCrawler.create_as_library(url)
-        results = InMemoryStorage.get_results()
-        article = results[url]
-        del results[url]
-        return article
-
-    @staticmethod
-    def download_articles(urls):
-        """
-        Crawls articles from the urls and extracts relevant information.
-        :param urls:
-        :return:
-        """
-        SingleCrawler.create_as_library(urls)
-        results = InMemoryStorage.get_results()
-        articles = []
-        for url in urls:
-            article = results[url]
-            del results[url]
-            articles.append(article)
-            print(article['title'])
-        return articles
-
-if __name__ == '__main__':
-    NewsPleaseLib.download_article('http://www.zeit.de/politik/deutschland/2017-02/fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 8e43f83..e5747a9 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ import sys, os
 setup(name='news-please',
-      version='1.0.25',
+      version='1.0.27',
       description="news-please is an open source easy-to-use news extractor that just works.",
       long_description="""\
 news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can follow recursively internal hyperlinks and read RSS feeds to fetch both most recent and also old, archived articles. You only need to provide the root URL of the news website.""",
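
Note: for reference, a minimal usage sketch of the library interface this patch relocates into newsplease/__init__.py. It assumes news-please is installed as a package; the zeit.de URL is taken from the removed __main__ block, while the example.com URLs in the batch call are placeholders, not real articles.

    from newsplease import NewsPleaseLib

    # Single article: returns the extracted item for the given URL.
    # The item is subscriptable by field name, e.g. 'title', as populated
    # by the InMemoryStorage pipeline.
    article = NewsPleaseLib.download_article(
        'http://www.zeit.de/politik/deutschland/2017-02/'
        'fluechtlinge-asylverfahren-bamf-taeuschung-afghanistan')
    print(article['title'])

    # Batch call: articles come back in the same order as the input URLs.
    # These URLs are placeholders for illustration only.
    articles = NewsPleaseLib.download_articles([
        'http://example.com/article-1',
        'http://example.com/article-2',
    ])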