Mirror of https://github.com/fhamborg/news-please.git (synced 2021-08-01 22:48:35 +03:00)

Commit: reorga
@@ -17,7 +17,7 @@
 
 # Crawling heuristics
 # Default Crawlers:
-# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./news_please/crawler/spiders/-dir)
+# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir)
 # default: SitemapCrawler
 default = SitemapCrawler
 
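For orientation, a minimal sketch of reading the option above with the standard library; the section name "Crawler" and the config path are assumptions for illustration, not taken from this commit:

    try:
        import configparser                      # Python 3
    except ImportError:
        import ConfigParser as configparser      # Python 2

    parser = configparser.RawConfigParser()
    parser.read("./config/config.cfg")
    # "default = SitemapCrawler" selects which spider class news-please starts;
    # the section name used here is assumed.
    crawler_name = parser.get("Crawler", "default")
    print(crawler_name)  # -> SitemapCrawler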
@@ -88,7 +88,7 @@ sitemap_allow_subdomains = True
 # - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded)
 # - meta_contains_article_keyword
 # - crawler_contains_only_article_alikes
-# (maybe not up-to-date, see ./news_please/helper_classes/heursitics.py:
+# (maybe not up-to-date, see ./newsplease/helper_classes/heursitics.py:
 # Every method not starting with __ should be a heuristic, except is_article)
 # These heuristics can be overwritten by sitelist.json for each site
 # default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
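The comment above says the crawler-wide heuristics can be overridden per site. A minimal sketch of that merge in Python; the key name "overwrite_heuristics" and the entry layout are assumptions about sitelist.hjson, not taken from this commit:

    # Crawler-wide defaults (copied from the "# default:" comment above).
    default_heuristics = {"og_type": True,
                          "linked_headlines": "<=0.65",
                          "self_linked_headlines": "<=0.56"}

    # Hypothetical entry from sitelist.hjson that relaxes one heuristic.
    site_entry = {"url": "https://example.com/",
                  "overwrite_heuristics": {"og_type": False}}

    effective = dict(default_heuristics)
    effective.update(site_entry.get("overwrite_heuristics", {}))
    print(effective)  # og_type is disabled for this one site only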
@@ -132,8 +132,8 @@ relative_to_start_processes_file = True
 # Here you can specify the input JSON-File
 # The input-file file containing the base-urls to crawl
 # absolute and relative file paths are allowed
-# default: ./config/sitelist.hjson
-url_input = ./config/sitelist.hjson
+# default: ../config/sitelist.hjson
+url_input = ../config/sitelist.hjson
 
 
 
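The default moved from ./config/ to ../config/, which only makes sense relative to where the code resolving it lives. A small sketch of that resolution, with the directory layout (config/ sitting next to the package directory) assumed rather than shown in this commit:

    import os

    # Resolve the relative default against this file's directory, one level up,
    # as the reorganised layout implies.
    here = os.path.dirname(os.path.realpath(__file__))
    sitelist_path = os.path.abspath(os.path.join(here, "../config/sitelist.hjson"))
    print(sitelist_path)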
@@ -287,8 +287,8 @@ LOG_ENCODING = utf-8
 
 BOT_NAME = 'news-please'
 
-SPIDER_MODULES = ['news_please.crawler.spiders']
-NEWSPIDER_MODULE = 'news_please.crawler.spiders'
+SPIDER_MODULES = ['newsplease.crawler.spiders']
+NEWSPIDER_MODULE = 'newsplease.crawler.spiders'
 
 # Resume/Pause functionality activation
 # default: .resume_jobdir
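SPIDER_MODULES tells Scrapy where to discover spider classes, so it has to track the package rename. A short sketch of how Scrapy consumes this setting (standard Scrapy API; only the module path comes from the diff):

    from scrapy.settings import Settings
    from scrapy.spiderloader import SpiderLoader

    settings = Settings({'SPIDER_MODULES': ['newsplease.crawler.spiders']})
    loader = SpiderLoader.from_settings(settings)
    # Lists the `name` attribute of every spider found in the renamed package.
    print(loader.list())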
@@ -313,9 +313,9 @@ USER_AGENT = 'news-please (+http://www.example.com/)'
 
 # Pipeline activation
 # Syntax: '<relative location>.<Pipeline name>': <Order of execution from 0-1000>
-# default: {'news_please.pipeline.pipelines.ArticleMasterExtractor':100, 'news_please.crawler.pipeline.LocalStorage':200, 'news_please.pipeline.pipelines.JsonFileStorage': 300}
-# Further options: 'news_please.pipeline.pipelines.ElasticsearchStorage': 350
-ITEM_PIPELINES = {'news_please.pipeline.pipelines.ArticleMasterExtractor':100,
-                  'news_please.pipeline.pipelines.LocalStorage':200,
-                  'news_please.pipeline.pipelines.JsonFileStorage':300
+# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300}
+# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350
+ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
+                  'newsplease.pipeline.pipelines.LocalStorage':200,
+                  'newsplease.pipeline.pipelines.JsonFileStorage':300
 }
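The order value (0-1000) in ITEM_PIPELINES decides when each pipeline runs. A minimal, hypothetical pipeline to illustrate the mechanism; the class and module names below are not part of news-please:

    from scrapy.exceptions import DropItem

    class DropEmptyTitles(object):
        # Standard Scrapy pipeline hook: return the item to pass it on,
        # raise DropItem to discard it before later pipelines see it.
        def process_item(self, item, spider):
            if not item.get("title"):
                raise DropItem("article without a title")
            return item

    # Registered with a low number so it runs before extraction/storage stages, e.g.:
    # ITEM_PIPELINES = {'mypackage.pipelines.DropEmptyTitles': 50, ...}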
@@ -11,9 +11,9 @@ from elasticsearch import Elasticsearch
 from scrapy.utils.log import configure_logging
 
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from news_please.helper_classes.savepath_parser import SavepathParser
-from news_please.config import JsonConfig
-from news_please.config import CrawlerConfig
+from newsplease.helper_classes.savepath_parser import SavepathParser
+from newsplease.config import JsonConfig
+from newsplease.config import CrawlerConfig
 
 try:
     import builtins
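The truncated try-block above is the usual Python 2/3 compatibility shim for the builtins module; the except branch is not part of this hunk, so the following completion is an assumption based on the common pattern:

    try:
        import builtins                      # Python 3
    except ImportError:
        import __builtin__ as builtins       # Python 2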
@@ -48,14 +48,14 @@ class NewsPlease(object):
     __single_crawler = False
 
     def __init__(self):
-        print("news_please is starting on Python " + sys.version)
+        print("newsplease is starting on Python " + sys.version)
         configure_logging({"LOG_LEVEL": "ERROR"})
         self.log = logging.getLogger(__name__)
 
         # Sets an environmental variable called 'CColon', so scripts can import
         # modules of this project in relation to this script's dir
         # example: sitemap_crawler can import UrlExtractor via
-        # from news_please.helper_classderes.url_extractor import UrlExtractor
+        # from newsplease.helper_classderes.url_extractor import UrlExtractor
         os.environ['CColon'] = os.path.dirname(__file__)
 
         if len(sys.argv) > 1 and (sys.argv[1] == 'help' or
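For context on the 'CColon' variable set above, a hedged sketch of how another script in the project could use it to make the project's modules importable regardless of the working directory; the consuming script itself is hypothetical and the exact layout is assumed:

    import os
    import sys

    # Pick up the directory exported by NewsPlease.__init__ above.
    project_dir = os.environ.get('CColon')
    if project_dir and project_dir not in sys.path:
        sys.path.append(project_dir)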
@@ -271,12 +271,12 @@ class NewsPlease(object):
                 input_config_file_path)[1] == ".cfg":
             return input_config_file_path
         else:
-            self.log.error("First argument passed to news_please "
+            self.log.error("First argument passed to newsplease "
                            "is not the config file. Falling back to "
                            "./config.cfg.")
 
         # Default
-        return self.get_abs_file_path("./config/config.cfg", quit_on_error=True)
+        return self.get_abs_file_path("../config/config.cfg", quit_on_error=True)
 
     def print_help(self):
         """
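The first context line above is the tail of a check on the file extension of the first CLI argument. A tiny sketch of that check in isolation (the surrounding condition is not fully visible in this hunk):

    import os

    # Only a first argument ending in ".cfg" is accepted as the config file;
    # anything else triggers the fallback to the packaged default config.
    arg = "./my_settings.cfg"
    print(os.path.splitext(arg)[1] == ".cfg")  # True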
@@ -284,13 +284,13 @@ class NewsPlease(object):
         """
         _help = (\
 """
-news_please
+newsplease
 -----------
 
 
 Usage:
 
-news_please [help] [cfg_file_path] [arg] ...
+newsplease [help] [cfg_file_path] [arg] ...
 
 
 Arguments:
@@ -20,9 +20,9 @@ from scrapy.spiderloader import SpiderLoader
 
 from scrapy.utils.log import configure_logging
 
-from news_please.config import CrawlerConfig
-from news_please.config import JsonConfig
-from news_please.helper import Helper
+from newsplease.config import CrawlerConfig
+from newsplease.config import JsonConfig
+from newsplease.helper import Helper
 
 
 class SingleCrawler(object):
@@ -41,7 +41,7 @@ class SingleCrawler(object):
     json_file_path = None
     cfg_crawler = None
     __scrapy_options = None
-    __crawer_module = "news_please.crawler.spiders"
+    __crawer_module = "newsplease.crawler.spiders"
     site_number = None
     shall_resume = False
     daemonize = False