Mirror of https://github.com/fhamborg/news-please.git (synced 2021-08-01 22:48:35 +03:00)

Commit: reorga
@@ -17,7 +17,7 @@
 
 # Crawling heuristics
 # Default Crawlers:
-# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./news_please/crawler/spiders/-dir)
+# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir)
 # default: SitemapCrawler
 default = SitemapCrawler
 
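For orientation, a minimal sketch of reading the option above with the standard library; the section name "Crawler" and the config path are assumptions for illustration, not taken from this commit:

    try:
        import configparser                      # Python 3
    except ImportError:
        import ConfigParser as configparser      # Python 2

    parser = configparser.RawConfigParser()
    parser.read("./config/config.cfg")
    # "default = SitemapCrawler" selects which spider class news-please starts;
    # the section name used here is assumed.
    crawler_name = parser.get("Crawler", "default")
    print(crawler_name)  # -> SitemapCrawler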
@@ -88,7 +88,7 @@ sitemap_allow_subdomains = True
 # - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded)
 # - meta_contains_article_keyword
 # - crawler_contains_only_article_alikes
-# (maybe not up-to-date, see ./news_please/helper_classes/heursitics.py:
+# (maybe not up-to-date, see ./newsplease/helper_classes/heursitics.py:
 # Every method not starting with __ should be a heuristic, except is_article)
 # These heuristics can be overwritten by sitelist.json for each site
 # default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
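The comment above says the crawler-wide heuristics can be overridden per site. A minimal sketch of that merge in Python; the key name "overwrite_heuristics" and the entry layout are assumptions about sitelist.hjson, not taken from this commit:

    # Crawler-wide defaults (copied from the "# default:" comment above).
    default_heuristics = {"og_type": True,
                          "linked_headlines": "<=0.65",
                          "self_linked_headlines": "<=0.56"}

    # Hypothetical entry from sitelist.hjson that relaxes one heuristic.
    site_entry = {"url": "https://example.com/",
                  "overwrite_heuristics": {"og_type": False}}

    effective = dict(default_heuristics)
    effective.update(site_entry.get("overwrite_heuristics", {}))
    print(effective)  # og_type is disabled for this one site only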
@@ -132,8 +132,8 @@ relative_to_start_processes_file = True
 # Here you can specify the input JSON-File
 # The input-file file containing the base-urls to crawl
 # absolute and relative file paths are allowed
-# default: ./config/sitelist.hjson
-url_input = ./config/sitelist.hjson
+# default: ../config/sitelist.hjson
+url_input = ../config/sitelist.hjson
 
 
 
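The default moved from ./config/ to ../config/, which only makes sense relative to where the code resolving it lives. A small sketch of that resolution, with the directory layout (config/ sitting next to the package directory) assumed rather than shown in this commit:

    import os

    # Resolve the relative default against this file's directory, one level up,
    # as the reorganised layout implies.
    here = os.path.dirname(os.path.realpath(__file__))
    sitelist_path = os.path.abspath(os.path.join(here, "../config/sitelist.hjson"))
    print(sitelist_path)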
@@ -287,8 +287,8 @@ LOG_ENCODING = utf-8
 
 BOT_NAME = 'news-please'
 
-SPIDER_MODULES = ['news_please.crawler.spiders']
-NEWSPIDER_MODULE = 'news_please.crawler.spiders'
+SPIDER_MODULES = ['newsplease.crawler.spiders']
+NEWSPIDER_MODULE = 'newsplease.crawler.spiders'
 
 # Resume/Pause functionality activation
 # default: .resume_jobdir
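SPIDER_MODULES tells Scrapy where to discover spider classes, so it has to track the package rename. A short sketch of how Scrapy consumes this setting (standard Scrapy API; only the module path comes from the diff):

    from scrapy.settings import Settings
    from scrapy.spiderloader import SpiderLoader

    settings = Settings({'SPIDER_MODULES': ['newsplease.crawler.spiders']})
    loader = SpiderLoader.from_settings(settings)
    # Lists the `name` attribute of every spider found in the renamed package.
    print(loader.list())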
@@ -313,9 +313,9 @@ USER_AGENT = 'news-please (+http://www.example.com/)'
 
 # Pipeline activation
 # Syntax: '<relative location>.<Pipeline name>': <Order of execution from 0-1000>
-# default: {'news_please.pipeline.pipelines.ArticleMasterExtractor':100, 'news_please.crawler.pipeline.LocalStorage':200, 'news_please.pipeline.pipelines.JsonFileStorage': 300}
-# Further options: 'news_please.pipeline.pipelines.ElasticsearchStorage': 350
-ITEM_PIPELINES = {'news_please.pipeline.pipelines.ArticleMasterExtractor':100,
-                  'news_please.pipeline.pipelines.LocalStorage':200,
-                  'news_please.pipeline.pipelines.JsonFileStorage':300
+# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300}
+# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350
+ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
+                  'newsplease.pipeline.pipelines.LocalStorage':200,
+                  'newsplease.pipeline.pipelines.JsonFileStorage':300
 }
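The order value (0-1000) in ITEM_PIPELINES decides when each pipeline runs. A minimal, hypothetical pipeline to illustrate the mechanism; the class and module names below are not part of news-please:

    from scrapy.exceptions import DropItem

    class DropEmptyTitles(object):
        # Standard Scrapy pipeline hook: return the item to pass it on,
        # raise DropItem to discard it before later pipelines see it.
        def process_item(self, item, spider):
            if not item.get("title"):
                raise DropItem("article without a title")
            return item

    # Registered with a low number so it runs before extraction/storage stages, e.g.:
    # ITEM_PIPELINES = {'mypackage.pipelines.DropEmptyTitles': 50, ...}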
@@ -11,9 +11,9 @@ from elasticsearch import Elasticsearch
 from scrapy.utils.log import configure_logging
 
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from news_please.helper_classes.savepath_parser import SavepathParser
-from news_please.config import JsonConfig
-from news_please.config import CrawlerConfig
+from newsplease.helper_classes.savepath_parser import SavepathParser
+from newsplease.config import JsonConfig
+from newsplease.config import CrawlerConfig
 
 try:
     import builtins
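The truncated try-block above is the usual Python 2/3 compatibility shim for the builtins module; the except branch is not part of this hunk, so the following completion is an assumption based on the common pattern:

    try:
        import builtins                      # Python 3
    except ImportError:
        import __builtin__ as builtins       # Python 2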
@@ -48,14 +48,14 @@ class NewsPlease(object):
     __single_crawler = False
 
     def __init__(self):
-        print("news_please is starting on Python " + sys.version)
+        print("newsplease is starting on Python " + sys.version)
         configure_logging({"LOG_LEVEL": "ERROR"})
         self.log = logging.getLogger(__name__)
 
         # Sets an environmental variable called 'CColon', so scripts can import
         # modules of this project in relation to this script's dir
         # example: sitemap_crawler can import UrlExtractor via
-        # from news_please.helper_classderes.url_extractor import UrlExtractor
+        # from newsplease.helper_classderes.url_extractor import UrlExtractor
         os.environ['CColon'] = os.path.dirname(__file__)
 
         if len(sys.argv) > 1 and (sys.argv[1] == 'help' or
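For context on the 'CColon' variable set above, a hedged sketch of how another script in the project could use it to make the project's modules importable regardless of the working directory; the consuming script itself is hypothetical and the exact layout is assumed:

    import os
    import sys

    # Pick up the directory exported by NewsPlease.__init__ above.
    project_dir = os.environ.get('CColon')
    if project_dir and project_dir not in sys.path:
        sys.path.append(project_dir)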
@@ -271,12 +271,12 @@ class NewsPlease(object):
                 input_config_file_path)[1] == ".cfg":
             return input_config_file_path
         else:
-            self.log.error("First argument passed to news_please "
+            self.log.error("First argument passed to newsplease "
                            "is not the config file. Falling back to "
                            "./config.cfg.")
 
         # Default
-        return self.get_abs_file_path("./config/config.cfg", quit_on_error=True)
+        return self.get_abs_file_path("../config/config.cfg", quit_on_error=True)
 
     def print_help(self):
         """
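The first context line above is the tail of a check on the file extension of the first CLI argument. A tiny sketch of that check in isolation (the surrounding condition is not fully visible in this hunk):

    import os

    # Only a first argument ending in ".cfg" is accepted as the config file;
    # anything else triggers the fallback to the packaged default config.
    arg = "./my_settings.cfg"
    print(os.path.splitext(arg)[1] == ".cfg")  # True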
@@ -284,13 +284,13 @@ class NewsPlease(object):
         """
         _help = (\
 """
-news_please
+newsplease
 -----------
 
 
 Usage:
 
-news_please [help] [cfg_file_path] [arg] ...
+newsplease [help] [cfg_file_path] [arg] ...
 
 
 Arguments:
@@ -20,9 +20,9 @@ from scrapy.spiderloader import SpiderLoader
 
 from scrapy.utils.log import configure_logging
 
-from news_please.config import CrawlerConfig
-from news_please.config import JsonConfig
-from news_please.helper import Helper
+from newsplease.config import CrawlerConfig
+from newsplease.config import JsonConfig
+from newsplease.helper import Helper
 
 
 class SingleCrawler(object):
@@ -41,7 +41,7 @@ class SingleCrawler(object):
     json_file_path = None
     cfg_crawler = None
     __scrapy_options = None
-    __crawer_module = "news_please.crawler.spiders"
+    __crawer_module = "newsplease.crawler.spiders"
     site_number = None
     shall_resume = False
     daemonize = False