Mirror of https://github.com/fhamborg/news-please.git, synced 2021-09-19 22:26:00 +03:00.
Commit: changes to make the Scrapy Item class customizable via configuration.
This commit is contained in:
@@ -328,5 +328,7 @@ ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
|
||||
'newsplease.pipeline.pipelines.JsonFileStorage':300
|
||||
}
|
||||
|
||||
ITEM_CLASS = 'newsplease.crawler.items.NewscrawlerItem'
|
||||
|
||||
[Pandas]
|
||||
file_name = "PandasStorage"
|
||||
@@ -26,6 +26,7 @@ class Helper(object):
|
||||
format_relative_path,
|
||||
sites_object,
|
||||
crawler_class,
|
||||
crawler_item_class,
|
||||
working_path
|
||||
):
|
||||
if not isinstance(sites_object[0]["url"], list):
|
||||
@@ -34,4 +35,5 @@ class Helper(object):
|
||||
self.url_extractor = UrlExtractor()
|
||||
self.savepath_parser = SavepathParser(
|
||||
cfg_savepath, relative_to_path, format_relative_path, self, working_path)
|
||||
self.crawler_item_class = crawler_item_class
|
||||
self.parse_crawler = ParseCrawler(self)
|
||||
|
||||
20
newsplease/helper_classes/module_util.py
Normal file
20
newsplease/helper_classes/module_util.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import importlib
|
||||
|
||||
|
||||
class ClassLoader:
    """Resolve a dotted class path (e.g. ``pkg.module.Class``) to the class object."""

    @classmethod
    def from_string(cls, class_name):
        """Import and return the object named by the dotted path *class_name*.

        :param class_name: fully qualified name, ``"package.module.ClassName"``.
        :return: the attribute (normally a class) looked up on the imported module.
        :raises ImportError: if the path contains no dot, the module cannot be
            imported, or the module does not define the named attribute.
        """
        if "." not in class_name:
            raise ImportError(f"{class_name} doesn't look like a module path")

        # Split "pkg.module.Class" at the last dot in a single pass.
        module_name, _, class_name = class_name.rpartition(".")

        try:
            loaded_module = importlib.import_module(module_name)
            loaded_class = getattr(loaded_module, class_name)
        except Exception as e:
            # Normalize both "module missing" and "attribute missing" into
            # ImportError, chaining the original cause for debugging.
            raise ImportError(
                f"Module {module_name} does not exist or does not define a class named {class_name}") from e

        return loaded_class
|
||||
@@ -7,8 +7,6 @@ import time
|
||||
|
||||
import scrapy
|
||||
|
||||
from ..crawler.items import NewscrawlerItem
|
||||
|
||||
# to improve performance, regex statements are compiled only once per module
|
||||
re_html = re.compile('text/html')
|
||||
|
||||
@@ -57,7 +55,8 @@ class ParseCrawler(object):
|
||||
relative_local_path = self.helper.savepath_parser \
|
||||
.get_savepath(response.url)
|
||||
|
||||
article = NewscrawlerItem()
|
||||
# Instantiate the crawler item class defined in the configuration
|
||||
article = self.helper.crawler_item_class()
|
||||
article['local_path'] = self.helper.savepath_parser \
|
||||
.get_formatted_relative_path(relative_local_path)
|
||||
article['filename'] = self.helper.savepath_parser.get_filename(article['local_path'])
|
||||
|
||||
@@ -23,6 +23,8 @@ sys.path.append(par_path)
|
||||
from newsplease.config import CrawlerConfig
|
||||
from newsplease.config import JsonConfig
|
||||
from newsplease.helper import Helper
|
||||
from newsplease.helper_classes.module_util import ClassLoader
|
||||
from newsplease.crawler.items import NewscrawlerItem
|
||||
|
||||
try:
|
||||
from _thread import start_new_thread
|
||||
@@ -122,12 +124,21 @@ class SingleCrawler(object):
|
||||
# absolute dir this script is in
|
||||
relative_to_path = os.path.dirname(__file__)
|
||||
|
||||
news_item_class_name = self.cfg.section("Scrapy").get("item_class", None)
|
||||
if not news_item_class_name:
|
||||
news_item_class = NewscrawlerItem
|
||||
else:
|
||||
news_item_class = ClassLoader.from_string(news_item_class_name)
|
||||
if not issubclass(news_item_class, NewscrawlerItem):
|
||||
raise ImportError("ITEM_CLASS must be a subclass of NewscrawlerItem")
|
||||
|
||||
self.helper = Helper(self.cfg.section('Heuristics'),
|
||||
self.cfg.section("Files")["local_data_directory"],
|
||||
relative_to_path,
|
||||
self.cfg.section('Files')['format_relative_path'],
|
||||
sites,
|
||||
crawler_class,
|
||||
news_item_class,
|
||||
self.cfg.get_working_path())
|
||||
|
||||
self.__scrapy_options = self.cfg.get_scrapy_options()
|
||||
|
||||
Reference in New Issue
Block a user