mirror of https://github.com/fhamborg/news-please.git synced 2021-09-27 00:44:24 +03:00
commit 4d8199ff42
parent 07ff5473fa
Author: felix
Date: 2016-11-09 18:33:45 +01:00
45 changed files with 24 additions and 24 deletions

View File

@@ -17,7 +17,7 @@
# Crawling heuristics
# Default Crawlers:
-# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./news_please/crawler/spiders/-dir)
+# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (./newsplease/crawler/spiders/-dir)
# default: SitemapCrawler
default = SitemapCrawler
@@ -88,7 +88,7 @@ sitemap_allow_subdomains = True
# - is_not_from_subdomain (with this setting enabled, it can be assured that only pages that aren't from a subdomain are downloaded)
# - meta_contains_article_keyword
# - crawler_contains_only_article_alikes
-# (maybe not up-to-date, see ./news_please/helper_classes/heursitics.py:
+# (maybe not up-to-date, see ./newsplease/helper_classes/heursitics.py:
# Every method not starting with __ should be a heuristic, except is_article)
# These heuristics can be overwritten by sitelist.json for each site
# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
@@ -132,8 +132,8 @@ relative_to_start_processes_file = True
# Here you can specify the input JSON-File
# The input-file file containing the base-urls to crawl
# absolute and relative file paths are allowed
-# default: ./config/sitelist.hjson
-url_input = ./config/sitelist.hjson
+# default: ../config/sitelist.hjson
+url_input = ../config/sitelist.hjson
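The default above moves from ./config/sitelist.hjson to ../config/sitelist.hjson, consistent with the config directory now sitting one level above the renamed package. A minimal sketch of how a relative url_input value could be resolved, assuming relative paths are interpreted against the directory of the module that reads the config (the helper name and the base directory are assumptions):

import os

def resolve_url_input(path, base_dir):
    # Resolve an absolute or relative url_input path against a base directory.
    if os.path.isabs(path):
        return path
    return os.path.abspath(os.path.join(base_dir, path))

# With the package installed at /opt/news-please/newsplease (hypothetical path):
print(resolve_url_input("../config/sitelist.hjson", "/opt/news-please/newsplease"))
# -> /opt/news-please/config/sitelist.hjson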
@@ -287,8 +287,8 @@ LOG_ENCODING = utf-8
BOT_NAME = 'news-please'
-SPIDER_MODULES = ['news_please.crawler.spiders']
-NEWSPIDER_MODULE = 'news_please.crawler.spiders'
+SPIDER_MODULES = ['newsplease.crawler.spiders']
+NEWSPIDER_MODULE = 'newsplease.crawler.spiders'
# Resume/Pause functionality activation
# default: .resume_jobdir
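SPIDER_MODULES and NEWSPIDER_MODULE tell Scrapy which package to scan for spider classes, so they must follow the rename to newsplease.crawler.spiders. As an illustration only (this spider is not part of the repository), a module in that package would expose spiders like this:

import scrapy

class ExampleSpider(scrapy.Spider):
    # Hypothetical spider; Scrapy discovers it because its module lives in
    # one of the packages listed in SPIDER_MODULES.
    name = "example"
    start_urls = ["http://www.example.com/"]

    def parse(self, response):
        # Real crawlers extract article data here; this just records the title.
        yield {"url": response.url,
               "title": response.css("title::text").extract_first()}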
@@ -313,9 +313,9 @@ USER_AGENT = 'news-please (+http://www.example.com/)'
# Pipeline activation
# Syntax: '<relative location>.<Pipeline name>': <Order of execution from 0-1000>
-# default: {'news_please.pipeline.pipelines.ArticleMasterExtractor':100, 'news_please.crawler.pipeline.LocalStorage':200, 'news_please.pipeline.pipelines.JsonFileStorage': 300}
-# Further options: 'news_please.pipeline.pipelines.ElasticsearchStorage': 350
-ITEM_PIPELINES = {'news_please.pipeline.pipelines.ArticleMasterExtractor':100,
-'news_please.pipeline.pipelines.LocalStorage':200,
-'news_please.pipeline.pipelines.JsonFileStorage':300
+# default: {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100, 'newsplease.crawler.pipeline.LocalStorage':200, 'newsplease.pipeline.pipelines.JsonFileStorage': 300}
+# Further options: 'newsplease.pipeline.pipelines.ElasticsearchStorage': 350
+ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
+'newsplease.pipeline.pipelines.LocalStorage':200,
+'newsplease.pipeline.pipelines.JsonFileStorage':300
}
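The numbers in ITEM_PIPELINES define the execution order (0-1000, lower runs first): ArticleMasterExtractor runs before LocalStorage and JsonFileStorage. For illustration, a minimal, hypothetical pipeline that could be inserted into this chain looks as follows; the class, the field names, and the registration key are assumptions, only the process_item interface is Scrapy's:

from scrapy.exceptions import DropItem

class DropShortArticles(object):
    # Hypothetical pipeline, e.g. registered as
    # 'newsplease.pipeline.pipelines.DropShortArticles': 150
    # so it would run after ArticleMasterExtractor (100) and before LocalStorage (200).

    def process_item(self, item, spider):
        text = item.get("text") or ""          # field name is an assumption
        if len(text) < 200:
            raise DropItem("Article text too short: %s" % item.get("url"))
        return item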

View File

@@ -11,9 +11,9 @@ from elasticsearch import Elasticsearch
from scrapy.utils.log import configure_logging
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from news_please.helper_classes.savepath_parser import SavepathParser
-from news_please.config import JsonConfig
-from news_please.config import CrawlerConfig
+from newsplease.helper_classes.savepath_parser import SavepathParser
+from newsplease.config import JsonConfig
+from newsplease.config import CrawlerConfig
try:
import builtins
@@ -48,14 +48,14 @@ class NewsPlease(object):
__single_crawler = False
def __init__(self):
print("news_please is starting on Python " + sys.version)
print("newsplease is starting on Python " + sys.version)
configure_logging({"LOG_LEVEL": "ERROR"})
self.log = logging.getLogger(__name__)
# Sets an environmental variable called 'CColon', so scripts can import
# modules of this project in relation to this script's dir
# example: sitemap_crawler can import UrlExtractor via
-# from news_please.helper_classderes.url_extractor import UrlExtractor
+# from newsplease.helper_classderes.url_extractor import UrlExtractor
os.environ['CColon'] = os.path.dirname(__file__)
if len(sys.argv) > 1 and (sys.argv[1] == 'help' or
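The 'CColon' environment variable stores the package directory so that separately launched scripts can locate the project's modules, as the comment above explains. A minimal sketch of how a consumer could use it; how the variable is actually read back is an assumption, and the module path newsplease.helper_classes.url_extractor follows the import style used elsewhere in this commit:

import os
import sys

# Hypothetical consumer of the 'CColon' variable set in NewsPlease.__init__:
# make the directory containing the newsplease package importable, then use
# ordinary absolute imports.
package_dir = os.environ.get("CColon")
if package_dir:
    sys.path.append(os.path.dirname(package_dir))

from newsplease.helper_classes.url_extractor import UrlExtractor  # noqa: E402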
@@ -271,12 +271,12 @@ class NewsPlease(object):
input_config_file_path)[1] == ".cfg":
return input_config_file_path
else:
self.log.error("First argument passed to news_please "
self.log.error("First argument passed to newsplease "
"is not the config file. Falling back to "
"./config.cfg.")
# Default
return self.get_abs_file_path("./config/config.cfg", quit_on_error=True)
return self.get_abs_file_path("../config/config.cfg", quit_on_error=True)
def print_help(self):
"""
@@ -284,13 +284,13 @@ class NewsPlease(object):
"""
_help = (\
"""
-news_please
+newsplease
-----------
Usage:
-news_please [help] [cfg_file_path] [arg] ...
+newsplease [help] [cfg_file_path] [arg] ...
Arguments:

View File

@@ -20,9 +20,9 @@ from scrapy.spiderloader import SpiderLoader
from scrapy.utils.log import configure_logging
-from news_please.config import CrawlerConfig
-from news_please.config import JsonConfig
-from news_please.helper import Helper
+from newsplease.config import CrawlerConfig
+from newsplease.config import JsonConfig
+from newsplease.helper import Helper
class SingleCrawler(object):
@@ -41,7 +41,7 @@ class SingleCrawler(object):
json_file_path = None
cfg_crawler = None
__scrapy_options = None
__crawer_module = "news_please.crawler.spiders"
__crawer_module = "newsplease.crawler.spiders"
site_number = None
shall_resume = False
daemonize = False
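SingleCrawler keeps the spider package name in __crawer_module and loads spiders through Scrapy's SpiderLoader (imported at the top of the file), so the string must follow the rename. A minimal sketch of looking up a spider by name from that package using Scrapy's public API; the spider name used here is an assumption:

from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader

settings = Settings()
settings.set("SPIDER_MODULES", ["newsplease.crawler.spiders"])

loader = SpiderLoader.from_settings(settings)
print(loader.list())                          # names of all spiders found in the package
spider_cls = loader.load("sitemap_crawler")   # spider name is an assumption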