diff --git a/misc/pypi/MANIFEST.in b/MANIFEST.in similarity index 100% rename from misc/pypi/MANIFEST.in rename to MANIFEST.in diff --git a/config/config.cfg b/config/config.cfg index f611136..d49b384 100644 --- a/config/config.cfg +++ b/config/config.cfg @@ -17,7 +17,7 @@ # Crawling heuristics # Default Crawlers: -# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./news_please/crawler/spiders/-dir) +# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir) # default: SitemapCrawler default = SitemapCrawler @@ -88,7 +88,7 @@ sitemap_allow_subdomains = True # - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded) # - meta_contains_article_keyword # - crawler_contains_only_article_alikes -# (maybe not up-to-date, see ./news_please/helper_classes/heursitics.py: +# (maybe not up-to-date, see ./newsplease/helper_classes/heuristics.py: # Every method not starting with __ should be a heuristic, except is_article) # These heuristics can be overwritten by sitelist.json for each site # default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"} @@ -132,8 +132,8 @@ relative_to_start_processes_file = True # Here you can specify the input JSON-File # The input-file file containing the base-urls to crawl # absolute and relative file paths are allowed -# default: ./config/sitelist.hjson -url_input = ./config/sitelist.hjson +# default: ../config/sitelist.hjson +url_input = ../config/sitelist.hjson @@ -287,8 +287,8 @@ LOG_ENCODING = utf-8 BOT_NAME = 'news-please' -SPIDER_MODULES = ['news_please.crawler.spiders'] -NEWSPIDER_MODULE = 'news_please.crawler.spiders' +SPIDER_MODULES = ['newsplease.crawler.spiders'] +NEWSPIDER_MODULE = 'newsplease.crawler.spiders' # Resume/Pause functionality activation # default: .resume_jobdir @@ -313,9 +313,9 @@ USER_AGENT = 
'news-please (+http://www.example.com/)' # Pipeline activation # Syntax: '.': -# default: {'news_please.pipeline.pipelines.ArticleMasterExtractor':100, 'news_please.crawler.pipeline.LocalStorage':200, 'news_please.pipeline.pipelines.JsonFileStorage': 300} -# Further options: 'news_please.pipeline.pipelines.ElasticsearchStorage': 350 -ITEM_PIPELINES = {'news_please.pipeline.pipelines.ArticleMasterExtractor':100, - 'news_please.pipeline.pipelines.LocalStorage':200, - 'news_please.pipeline.pipelines.JsonFileStorage':300 +# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.pipeline.pipelines.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300} +# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350 +ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, + 'newsplease.pipeline.pipelines.LocalStorage':200, + 'newsplease.pipeline.pipelines.JsonFileStorage':300 } \ No newline at end of file diff --git a/news_please/pipeline/extractor/extractors/__init__.py b/news_please/pipeline/extractor/extractors/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/__init__.py b/newsplease/__init__.py similarity index 100% rename from __init__.py rename to newsplease/__init__.py diff --git a/__main__.py b/newsplease/__main__.py similarity index 97% rename from __main__.py rename to newsplease/__main__.py index 866f68e..34aa723 100644 --- a/__main__.py +++ b/newsplease/__main__.py @@ -11,9 +11,9 @@ from elasticsearch import Elasticsearch from scrapy.utils.log import configure_logging sys.path.append(os.path.dirname(os.path.realpath(__file__))) -from news_please.helper_classes.savepath_parser import SavepathParser -from news_please.config import JsonConfig -from news_please.config import CrawlerConfig +from newsplease.helper_classes.savepath_parser import SavepathParser +from newsplease.config import JsonConfig +from newsplease.config import CrawlerConfig try: import builtins 
@@ -48,14 +48,14 @@ class NewsPlease(object): __single_crawler = False def __init__(self): - print("news_please is starting on Python " + sys.version) + print("newsplease is starting on Python " + sys.version) configure_logging({"LOG_LEVEL": "ERROR"}) self.log = logging.getLogger(__name__) # Sets an environmental variable called 'CColon', so scripts can import # modules of this project in relation to this script's dir # example: sitemap_crawler can import UrlExtractor via - # from news_please.helper_classderes.url_extractor import UrlExtractor + # from newsplease.helper_classes.url_extractor import UrlExtractor os.environ['CColon'] = os.path.dirname(__file__) if len(sys.argv) > 1 and (sys.argv[1] == 'help' or @@ -271,12 +271,12 @@ class NewsPlease(object): input_config_file_path)[1] == ".cfg": return input_config_file_path else: - self.log.error("First argument passed to news_please " + self.log.error("First argument passed to newsplease " "is not the config file. Falling back to " "./config.cfg.") # Default - return self.get_abs_file_path("./config/config.cfg", quit_on_error=True) + return self.get_abs_file_path("../config/config.cfg", quit_on_error=True) def print_help(self): """ @@ -284,13 +284,13 @@ class NewsPlease(object): """ _help = (\ """ -news_please +newsplease ----------- Usage: - news_please [help] [cfg_file_path] [arg] ... + newsplease [help] [cfg_file_path] [arg] ... 
Arguments: diff --git a/news_please/config.py b/newsplease/config.py similarity index 100% rename from news_please/config.py rename to newsplease/config.py diff --git a/news_please/__init__.py b/newsplease/crawler/__init__.py similarity index 100% rename from news_please/__init__.py rename to newsplease/crawler/__init__.py diff --git a/news_please/crawler/items.py b/newsplease/crawler/items.py similarity index 100% rename from news_please/crawler/items.py rename to newsplease/crawler/items.py diff --git a/news_please/crawler/spiders/__init__.py b/newsplease/crawler/spiders/__init__.py similarity index 100% rename from news_please/crawler/spiders/__init__.py rename to newsplease/crawler/spiders/__init__.py diff --git a/news_please/crawler/spiders/download_crawler.py b/newsplease/crawler/spiders/download_crawler.py similarity index 100% rename from news_please/crawler/spiders/download_crawler.py rename to newsplease/crawler/spiders/download_crawler.py diff --git a/news_please/crawler/spiders/recursive_crawler.py b/newsplease/crawler/spiders/recursive_crawler.py similarity index 100% rename from news_please/crawler/spiders/recursive_crawler.py rename to newsplease/crawler/spiders/recursive_crawler.py diff --git a/news_please/crawler/spiders/recursive_sitemap_crawler.py b/newsplease/crawler/spiders/recursive_sitemap_crawler.py similarity index 100% rename from news_please/crawler/spiders/recursive_sitemap_crawler.py rename to newsplease/crawler/spiders/recursive_sitemap_crawler.py diff --git a/news_please/crawler/spiders/rss_crawler.py b/newsplease/crawler/spiders/rss_crawler.py similarity index 100% rename from news_please/crawler/spiders/rss_crawler.py rename to newsplease/crawler/spiders/rss_crawler.py diff --git a/news_please/crawler/spiders/sitemap_crawler.py b/newsplease/crawler/spiders/sitemap_crawler.py similarity index 100% rename from news_please/crawler/spiders/sitemap_crawler.py rename to newsplease/crawler/spiders/sitemap_crawler.py diff --git 
a/news_please/helper.py b/newsplease/helper.py similarity index 100% rename from news_please/helper.py rename to newsplease/helper.py diff --git a/news_please/crawler/__init__.py b/newsplease/helper_classes/__init__.py similarity index 100% rename from news_please/crawler/__init__.py rename to newsplease/helper_classes/__init__.py diff --git a/news_please/helper_classes/heuristics.py b/newsplease/helper_classes/heuristics.py similarity index 100% rename from news_please/helper_classes/heuristics.py rename to newsplease/helper_classes/heuristics.py diff --git a/news_please/helper_classes/parse_crawler.py b/newsplease/helper_classes/parse_crawler.py similarity index 100% rename from news_please/helper_classes/parse_crawler.py rename to newsplease/helper_classes/parse_crawler.py diff --git a/news_please/helper_classes/savepath_parser.py b/newsplease/helper_classes/savepath_parser.py similarity index 100% rename from news_please/helper_classes/savepath_parser.py rename to newsplease/helper_classes/savepath_parser.py diff --git a/news_please/helper_classes/__init__.py b/newsplease/helper_classes/sub_classes/__init__.py similarity index 100% rename from news_please/helper_classes/__init__.py rename to newsplease/helper_classes/sub_classes/__init__.py diff --git a/news_please/helper_classes/sub_classes/heuristics_manager.py b/newsplease/helper_classes/sub_classes/heuristics_manager.py similarity index 100% rename from news_please/helper_classes/sub_classes/heuristics_manager.py rename to newsplease/helper_classes/sub_classes/heuristics_manager.py diff --git a/news_please/helper_classes/url_extractor.py b/newsplease/helper_classes/url_extractor.py similarity index 100% rename from news_please/helper_classes/url_extractor.py rename to newsplease/helper_classes/url_extractor.py diff --git a/news_please/helper_classes/sub_classes/__init__.py b/newsplease/pipeline/__init__.py similarity index 100% rename from news_please/helper_classes/sub_classes/__init__.py rename to 
newsplease/pipeline/__init__.py diff --git a/news_please/pipeline/__init__.py b/newsplease/pipeline/extractor/__init__.py similarity index 100% rename from news_please/pipeline/__init__.py rename to newsplease/pipeline/extractor/__init__.py diff --git a/news_please/pipeline/extractor/article_candidate.py b/newsplease/pipeline/extractor/article_candidate.py similarity index 100% rename from news_please/pipeline/extractor/article_candidate.py rename to newsplease/pipeline/extractor/article_candidate.py diff --git a/news_please/pipeline/extractor/article_extractor.py b/newsplease/pipeline/extractor/article_extractor.py similarity index 100% rename from news_please/pipeline/extractor/article_extractor.py rename to newsplease/pipeline/extractor/article_extractor.py diff --git a/news_please/pipeline/extractor/cleaner.py b/newsplease/pipeline/extractor/cleaner.py similarity index 100% rename from news_please/pipeline/extractor/cleaner.py rename to newsplease/pipeline/extractor/cleaner.py diff --git a/news_please/pipeline/extractor/__init__.py b/newsplease/pipeline/extractor/comparer/__init__.py similarity index 100% rename from news_please/pipeline/extractor/__init__.py rename to newsplease/pipeline/extractor/comparer/__init__.py diff --git a/news_please/pipeline/extractor/comparer/comparer.py b/newsplease/pipeline/extractor/comparer/comparer.py similarity index 100% rename from news_please/pipeline/extractor/comparer/comparer.py rename to newsplease/pipeline/extractor/comparer/comparer.py diff --git a/news_please/pipeline/extractor/comparer/comparer_Language.py b/newsplease/pipeline/extractor/comparer/comparer_Language.py similarity index 100% rename from news_please/pipeline/extractor/comparer/comparer_Language.py rename to newsplease/pipeline/extractor/comparer/comparer_Language.py diff --git a/news_please/pipeline/extractor/comparer/comparer_author.py b/newsplease/pipeline/extractor/comparer/comparer_author.py similarity index 100% rename from 
news_please/pipeline/extractor/comparer/comparer_author.py rename to newsplease/pipeline/extractor/comparer/comparer_author.py diff --git a/news_please/pipeline/extractor/comparer/comparer_date.py b/newsplease/pipeline/extractor/comparer/comparer_date.py similarity index 100% rename from news_please/pipeline/extractor/comparer/comparer_date.py rename to newsplease/pipeline/extractor/comparer/comparer_date.py diff --git a/news_please/pipeline/extractor/comparer/comparer_description.py b/newsplease/pipeline/extractor/comparer/comparer_description.py similarity index 100% rename from news_please/pipeline/extractor/comparer/comparer_description.py rename to newsplease/pipeline/extractor/comparer/comparer_description.py diff --git a/news_please/pipeline/extractor/comparer/comparer_text.py b/newsplease/pipeline/extractor/comparer/comparer_text.py similarity index 100% rename from news_please/pipeline/extractor/comparer/comparer_text.py rename to newsplease/pipeline/extractor/comparer/comparer_text.py diff --git a/news_please/pipeline/extractor/comparer/comparer_title.py b/newsplease/pipeline/extractor/comparer/comparer_title.py similarity index 100% rename from news_please/pipeline/extractor/comparer/comparer_title.py rename to newsplease/pipeline/extractor/comparer/comparer_title.py diff --git a/news_please/pipeline/extractor/comparer/comparer_topimage.py b/newsplease/pipeline/extractor/comparer/comparer_topimage.py similarity index 100% rename from news_please/pipeline/extractor/comparer/comparer_topimage.py rename to newsplease/pipeline/extractor/comparer/comparer_topimage.py diff --git a/news_please/pipeline/extractor/comparer/__init__.py b/newsplease/pipeline/extractor/extractors/__init__.py similarity index 100% rename from news_please/pipeline/extractor/comparer/__init__.py rename to newsplease/pipeline/extractor/extractors/__init__.py diff --git a/news_please/pipeline/extractor/extractors/abstract_extractor.py 
b/newsplease/pipeline/extractor/extractors/abstract_extractor.py similarity index 100% rename from news_please/pipeline/extractor/extractors/abstract_extractor.py rename to newsplease/pipeline/extractor/extractors/abstract_extractor.py diff --git a/news_please/pipeline/extractor/extractors/date_extractor.py b/newsplease/pipeline/extractor/extractors/date_extractor.py similarity index 100% rename from news_please/pipeline/extractor/extractors/date_extractor.py rename to newsplease/pipeline/extractor/extractors/date_extractor.py diff --git a/news_please/pipeline/extractor/extractors/lang_detect_extractor.py b/newsplease/pipeline/extractor/extractors/lang_detect_extractor.py similarity index 100% rename from news_please/pipeline/extractor/extractors/lang_detect_extractor.py rename to newsplease/pipeline/extractor/extractors/lang_detect_extractor.py diff --git a/news_please/pipeline/extractor/extractors/newspaper_extractor.py b/newsplease/pipeline/extractor/extractors/newspaper_extractor.py similarity index 100% rename from news_please/pipeline/extractor/extractors/newspaper_extractor.py rename to newsplease/pipeline/extractor/extractors/newspaper_extractor.py diff --git a/news_please/pipeline/extractor/extractors/readability_extractor.py b/newsplease/pipeline/extractor/extractors/readability_extractor.py similarity index 100% rename from news_please/pipeline/extractor/extractors/readability_extractor.py rename to newsplease/pipeline/extractor/extractors/readability_extractor.py diff --git a/news_please/pipeline/pipelines.py b/newsplease/pipeline/pipelines.py similarity index 100% rename from news_please/pipeline/pipelines.py rename to newsplease/pipeline/pipelines.py diff --git a/single_crawler.py b/newsplease/single_crawler.py similarity index 97% rename from single_crawler.py rename to newsplease/single_crawler.py index 58e93b8..42e0004 100644 --- a/single_crawler.py +++ b/newsplease/single_crawler.py @@ -20,9 +20,9 @@ from scrapy.spiderloader import SpiderLoader 
from scrapy.utils.log import configure_logging -from news_please.config import CrawlerConfig -from news_please.config import JsonConfig -from news_please.helper import Helper +from newsplease.config import CrawlerConfig +from newsplease.config import JsonConfig +from newsplease.helper import Helper class SingleCrawler(object): @@ -41,7 +41,7 @@ class SingleCrawler(object): json_file_path = None cfg_crawler = None __scrapy_options = None - __crawer_module = "news_please.crawler.spiders" + __crawer_module = "newsplease.crawler.spiders" site_number = None shall_resume = False daemonize = False diff --git a/misc/pypi/setup.py b/setup.py similarity index 100% rename from misc/pypi/setup.py rename to setup.py