diff --git a/dist/news-please-1.2.25.tar.gz b/dist/news-please-1.2.25.tar.gz new file mode 100644 index 0000000..e9045cd Binary files /dev/null and b/dist/news-please-1.2.25.tar.gz differ diff --git a/dist/news-please-1.2.26.tar.gz b/dist/news-please-1.2.26.tar.gz new file mode 100644 index 0000000..4590350 Binary files /dev/null and b/dist/news-please-1.2.26.tar.gz differ diff --git a/dist/news-please-1.2.27.tar.gz b/dist/news-please-1.2.27.tar.gz new file mode 100644 index 0000000..532b57d Binary files /dev/null and b/dist/news-please-1.2.27.tar.gz differ diff --git a/dist/news-please-1.2.28.tar.gz b/dist/news-please-1.2.28.tar.gz new file mode 100644 index 0000000..fd0d398 Binary files /dev/null and b/dist/news-please-1.2.28.tar.gz differ diff --git a/dist/news-please-1.2.31.tar.gz b/dist/news-please-1.2.31.tar.gz new file mode 100644 index 0000000..9597359 Binary files /dev/null and b/dist/news-please-1.2.31.tar.gz differ diff --git a/dist/news-please-1.2.32.tar.gz b/dist/news-please-1.2.32.tar.gz new file mode 100644 index 0000000..978d109 Binary files /dev/null and b/dist/news-please-1.2.32.tar.gz differ diff --git a/dist/news-please-1.2.33.tar.gz b/dist/news-please-1.2.33.tar.gz new file mode 100644 index 0000000..d6ceb3d Binary files /dev/null and b/dist/news-please-1.2.33.tar.gz differ diff --git a/dist/news-please-1.2.35.tar.gz b/dist/news-please-1.2.35.tar.gz new file mode 100644 index 0000000..7051c82 Binary files /dev/null and b/dist/news-please-1.2.35.tar.gz differ diff --git a/dist/news-please-1.2.36.tar.gz b/dist/news-please-1.2.36.tar.gz new file mode 100644 index 0000000..2d23c1a Binary files /dev/null and b/dist/news-please-1.2.36.tar.gz differ diff --git a/dist/news-please-1.2.39.tar.gz b/dist/news-please-1.2.39.tar.gz new file mode 100644 index 0000000..474ca56 Binary files /dev/null and b/dist/news-please-1.2.39.tar.gz differ diff --git a/dist/news-please-1.2.40.tar.gz b/dist/news-please-1.2.40.tar.gz new file mode 100644 index 
0000000..87886b9 Binary files /dev/null and b/dist/news-please-1.2.40.tar.gz differ diff --git a/dist/news-please-1.2.41.tar.gz b/dist/news-please-1.2.41.tar.gz new file mode 100644 index 0000000..8a1f8a9 Binary files /dev/null and b/dist/news-please-1.2.41.tar.gz differ diff --git a/dist/news-please-1.2.42.tar.gz b/dist/news-please-1.2.42.tar.gz new file mode 100644 index 0000000..accc3c8 Binary files /dev/null and b/dist/news-please-1.2.42.tar.gz differ diff --git a/dist/news-please-1.2.43.tar.gz b/dist/news-please-1.2.43.tar.gz new file mode 100644 index 0000000..b203d2b Binary files /dev/null and b/dist/news-please-1.2.43.tar.gz differ diff --git a/dist/news-please-1.2.44.tar.gz b/dist/news-please-1.2.44.tar.gz new file mode 100644 index 0000000..e742cad Binary files /dev/null and b/dist/news-please-1.2.44.tar.gz differ diff --git a/dist/news-please-1.2.50.tar.gz b/dist/news-please-1.2.50.tar.gz new file mode 100644 index 0000000..0ff9e2a Binary files /dev/null and b/dist/news-please-1.2.50.tar.gz differ diff --git a/dist/news-please-1.2.51.tar.gz b/dist/news-please-1.2.51.tar.gz new file mode 100644 index 0000000..d0b4571 Binary files /dev/null and b/dist/news-please-1.2.51.tar.gz differ diff --git a/dist/news-please-1.2.52.tar.gz b/dist/news-please-1.2.52.tar.gz new file mode 100644 index 0000000..c3ffcf5 Binary files /dev/null and b/dist/news-please-1.2.52.tar.gz differ diff --git a/dist/news-please-1.2.53.tar.gz b/dist/news-please-1.2.53.tar.gz new file mode 100644 index 0000000..5fbbbf9 Binary files /dev/null and b/dist/news-please-1.2.53.tar.gz differ diff --git a/dist/news-please-1.3.10.tar.gz b/dist/news-please-1.3.10.tar.gz new file mode 100644 index 0000000..7f6f152 Binary files /dev/null and b/dist/news-please-1.3.10.tar.gz differ diff --git a/dist/news-please-1.3.11.tar.gz b/dist/news-please-1.3.11.tar.gz new file mode 100644 index 0000000..eab79db Binary files /dev/null and b/dist/news-please-1.3.11.tar.gz differ diff --git 
a/dist/news-please-1.3.13.tar.gz b/dist/news-please-1.3.13.tar.gz new file mode 100644 index 0000000..fa67dab Binary files /dev/null and b/dist/news-please-1.3.13.tar.gz differ diff --git a/dist/news-please-1.3.14.tar.gz b/dist/news-please-1.3.14.tar.gz new file mode 100644 index 0000000..3123866 Binary files /dev/null and b/dist/news-please-1.3.14.tar.gz differ diff --git a/dist/news-please-1.4.10.tar.gz b/dist/news-please-1.4.10.tar.gz new file mode 100644 index 0000000..e094a42 Binary files /dev/null and b/dist/news-please-1.4.10.tar.gz differ diff --git a/dist/news-please-1.4.11.tar.gz b/dist/news-please-1.4.11.tar.gz new file mode 100644 index 0000000..8a4d740 Binary files /dev/null and b/dist/news-please-1.4.11.tar.gz differ diff --git a/dist/news-please-1.4.12.tar.gz b/dist/news-please-1.4.12.tar.gz new file mode 100644 index 0000000..e2f33f7 Binary files /dev/null and b/dist/news-please-1.4.12.tar.gz differ diff --git a/dist/news-please-1.4.13.tar.gz b/dist/news-please-1.4.13.tar.gz new file mode 100644 index 0000000..0f0ea6d Binary files /dev/null and b/dist/news-please-1.4.13.tar.gz differ diff --git a/dist/news-please-1.4.14.tar.gz b/dist/news-please-1.4.14.tar.gz new file mode 100644 index 0000000..f756b2a Binary files /dev/null and b/dist/news-please-1.4.14.tar.gz differ diff --git a/dist/news-please-1.4.15.tar.gz b/dist/news-please-1.4.15.tar.gz new file mode 100644 index 0000000..2e5816f Binary files /dev/null and b/dist/news-please-1.4.15.tar.gz differ diff --git a/dist/news-please-1.4.16.tar.gz b/dist/news-please-1.4.16.tar.gz new file mode 100644 index 0000000..75a68d6 Binary files /dev/null and b/dist/news-please-1.4.16.tar.gz differ diff --git a/dist/news-please-1.4.17.tar.gz b/dist/news-please-1.4.17.tar.gz new file mode 100644 index 0000000..783aee1 Binary files /dev/null and b/dist/news-please-1.4.17.tar.gz differ diff --git a/dist/news-please-1.4.18.tar.gz b/dist/news-please-1.4.18.tar.gz new file mode 100644 index 0000000..4e606a2 Binary 
files /dev/null and b/dist/news-please-1.4.18.tar.gz differ diff --git a/dist/news-please-1.4.19.tar.gz b/dist/news-please-1.4.19.tar.gz new file mode 100644 index 0000000..ef49268 Binary files /dev/null and b/dist/news-please-1.4.19.tar.gz differ diff --git a/dist/news-please-1.4.20.tar.gz b/dist/news-please-1.4.20.tar.gz new file mode 100644 index 0000000..3b64122 Binary files /dev/null and b/dist/news-please-1.4.20.tar.gz differ diff --git a/dist/news-please-1.4.21.tar.gz b/dist/news-please-1.4.21.tar.gz new file mode 100644 index 0000000..629340e Binary files /dev/null and b/dist/news-please-1.4.21.tar.gz differ diff --git a/dist/news-please-1.4.22.tar.gz b/dist/news-please-1.4.22.tar.gz new file mode 100644 index 0000000..569266f Binary files /dev/null and b/dist/news-please-1.4.22.tar.gz differ diff --git a/dist/news-please-1.4.23.tar.gz b/dist/news-please-1.4.23.tar.gz new file mode 100644 index 0000000..33f0132 Binary files /dev/null and b/dist/news-please-1.4.23.tar.gz differ diff --git a/dist/news-please-1.4.24.tar.gz b/dist/news-please-1.4.24.tar.gz new file mode 100644 index 0000000..5fe93cc Binary files /dev/null and b/dist/news-please-1.4.24.tar.gz differ diff --git a/dist/news-please-1.4.25.tar.gz b/dist/news-please-1.4.25.tar.gz new file mode 100644 index 0000000..e1a0226 Binary files /dev/null and b/dist/news-please-1.4.25.tar.gz differ diff --git a/dist/news-please-1.4.26.tar.gz b/dist/news-please-1.4.26.tar.gz new file mode 100644 index 0000000..8ab8348 Binary files /dev/null and b/dist/news-please-1.4.26.tar.gz differ diff --git a/dist/news-please-1.5.1.tar.gz b/dist/news-please-1.5.1.tar.gz new file mode 100644 index 0000000..bd4211c Binary files /dev/null and b/dist/news-please-1.5.1.tar.gz differ diff --git a/dist/news-please-1.5.2.tar.gz b/dist/news-please-1.5.2.tar.gz new file mode 100644 index 0000000..3262d1e Binary files /dev/null and b/dist/news-please-1.5.2.tar.gz differ diff --git a/news_please.egg-info/PKG-INFO 
b/news_please.egg-info/PKG-INFO new file mode 100644 index 0000000..834fff4 --- /dev/null +++ b/news_please.egg-info/PKG-INFO @@ -0,0 +1,25 @@ +Metadata-Version: 1.1 +Name: news-please +Version: 1.5.2 +Summary: news-please is an open source easy-to-use news extractor that just works. +Home-page: https://github.com/fhamborg/news-please +Author: Felix Hamborg +Author-email: felix.hamborg@uni-konstanz.de +License: Apache License 2.0 +Download-URL: https://github.com/fhamborg/news-please +Description: news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can recursively follow internal hyperlinks and read RSS feeds to fetch both most recent and also old, archived articles. You only need to provide the root URL of the news website. Furthermore, its API allows developers to access the extraction functionality within their software. news-please also implements a workflow optimized for the news archive provided by commoncrawl.org, allowing users to efficiently crawl and extract news articles including various filter options. 
+Keywords: news crawler news scraper news extractor crawler extractor scraper information retrieval +Platform: UNKNOWN +Classifier: Development Status :: 4 - Beta +Classifier: Environment :: Console +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: Science/Research +Classifier: License :: OSI Approved :: Apache Software License +Classifier: Operating System :: MacOS +Classifier: Operating System :: Microsoft +Classifier: Operating System :: POSIX :: Linux +Classifier: Programming Language :: Python :: 3.4 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Topic :: Internet +Classifier: Topic :: Scientific/Engineering :: Information Analysis diff --git a/news_please.egg-info/SOURCES.txt b/news_please.egg-info/SOURCES.txt new file mode 100644 index 0000000..517cc90 --- /dev/null +++ b/news_please.egg-info/SOURCES.txt @@ -0,0 +1,65 @@ +LICENSE.txt +MANIFEST.in +README.md +requirements.txt +setup.py +news_please.egg-info/PKG-INFO +news_please.egg-info/SOURCES.txt +news_please.egg-info/dependency_links.txt +news_please.egg-info/entry_points.txt +news_please.egg-info/not-zip-safe +news_please.egg-info/requires.txt +news_please.egg-info/top_level.txt +newsplease/NewsArticle.py +newsplease/__init__.py +newsplease/__main__.py +newsplease/config.py +newsplease/helper.py +newsplease/single_crawler.py +newsplease/config/config.cfg +newsplease/config/config_lib.cfg +newsplease/config/sitelist.hjson +newsplease/crawler/__init__.py +newsplease/crawler/commoncrawl_crawler.py +newsplease/crawler/commoncrawl_extractor.py +newsplease/crawler/items.py +newsplease/crawler/simple_crawler.py +newsplease/crawler/spiders/__init__.py +newsplease/crawler/spiders/download_crawler.py +newsplease/crawler/spiders/gdelt_crawler.py +newsplease/crawler/spiders/recursive_crawler.py +newsplease/crawler/spiders/recursive_sitemap_crawler.py +newsplease/crawler/spiders/rss_crawler.py 
+newsplease/crawler/spiders/sitemap_crawler.py +newsplease/examples/__init__.py +newsplease/examples/commoncrawl.py +newsplease/examples/downloadfromfile.py +newsplease/examples/downloadfromurl.py +newsplease/helper_classes/__init__.py +newsplease/helper_classes/heuristics.py +newsplease/helper_classes/parse_crawler.py +newsplease/helper_classes/savepath_parser.py +newsplease/helper_classes/url_extractor.py +newsplease/helper_classes/sub_classes/__init__.py +newsplease/helper_classes/sub_classes/heuristics_manager.py +newsplease/pipeline/__init__.py +newsplease/pipeline/pipelines.py +newsplease/pipeline/extractor/__init__.py +newsplease/pipeline/extractor/article_candidate.py +newsplease/pipeline/extractor/article_extractor.py +newsplease/pipeline/extractor/cleaner.py +newsplease/pipeline/extractor/comparer/__init__.py +newsplease/pipeline/extractor/comparer/comparer.py +newsplease/pipeline/extractor/comparer/comparer_Language.py +newsplease/pipeline/extractor/comparer/comparer_author.py +newsplease/pipeline/extractor/comparer/comparer_date.py +newsplease/pipeline/extractor/comparer/comparer_description.py +newsplease/pipeline/extractor/comparer/comparer_text.py +newsplease/pipeline/extractor/comparer/comparer_title.py +newsplease/pipeline/extractor/comparer/comparer_topimage.py +newsplease/pipeline/extractor/extractors/__init__.py +newsplease/pipeline/extractor/extractors/abstract_extractor.py +newsplease/pipeline/extractor/extractors/date_extractor.py +newsplease/pipeline/extractor/extractors/lang_detect_extractor.py +newsplease/pipeline/extractor/extractors/newspaper_extractor.py +newsplease/pipeline/extractor/extractors/readability_extractor.py \ No newline at end of file diff --git a/news_please.egg-info/dependency_links.txt b/news_please.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/news_please.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/news_please.egg-info/entry_points.txt 
b/news_please.egg-info/entry_points.txt new file mode 100644 index 0000000..e77df5e --- /dev/null +++ b/news_please.egg-info/entry_points.txt @@ -0,0 +1,4 @@ +[console_scripts] +news-please = newsplease.__main__:main +news-please-cc = newsplease.examples.commoncrawl:main + diff --git a/news_please.egg-info/not-zip-safe b/news_please.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/news_please.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/news_please.egg-info/requires.txt b/news_please.egg-info/requires.txt new file mode 100644 index 0000000..5757c9f --- /dev/null +++ b/news_please.egg-info/requires.txt @@ -0,0 +1,24 @@ +Scrapy>=1.1.0 +PyMySQL>=0.7.9 +psycopg2-binary>=2.8.4 +hjson>=1.5.8 +elasticsearch>=2.4 +beautifulsoup4>=4.3.2 +readability-lxml>=0.6.2 +newspaper3k>=0.2.8 +langdetect>=1.0.7 +python-dateutil>=2.4.0 +plac>=0.9.6 +dotmap>=1.2.17 +readability-lxml>=0.6.2 +PyDispatcher>=2.0.5 +warcio>=1.3.3 +ago>=0.0.9 +six>=1.10.0 +lxml>=3.3.5 +awscli>=1.11.117 +hurry.filesize>=0.9 +bs4 + +[:sys_platform == "win32"] +pywin32>=220 diff --git a/news_please.egg-info/top_level.txt b/news_please.egg-info/top_level.txt new file mode 100644 index 0000000..441b68c --- /dev/null +++ b/news_please.egg-info/top_level.txt @@ -0,0 +1 @@ +newsplease