1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-19 22:26:00 +03:00

Changes to make Scrapy Item class customizable via configuration

This commit is contained in:
Thihara Neranjya
2020-04-22 19:10:50 +05:30
parent 2b55c29798
commit 26d9e2c34e
5 changed files with 37 additions and 3 deletions

View File

@@ -328,5 +328,7 @@ ITEM_PIPELINES = {'newsplease.pipeline.pipelines.ArticleMasterExtractor':100,
'newsplease.pipeline.pipelines.JsonFileStorage':300
}
ITEM_CLASS = 'newsplease.crawler.items.NewscrawlerItem'
[Pandas]
file_name = "PandasStorage"

View File

@@ -26,6 +26,7 @@ class Helper(object):
format_relative_path,
sites_object,
crawler_class,
crawler_item_class,
working_path
):
if not isinstance(sites_object[0]["url"], list):
@@ -34,4 +35,5 @@ class Helper(object):
self.url_extractor = UrlExtractor()
self.savepath_parser = SavepathParser(
cfg_savepath, relative_to_path, format_relative_path, self, working_path)
self.crawler_item_class = crawler_item_class
self.parse_crawler = ParseCrawler(self)

View File

@@ -0,0 +1,20 @@
import importlib
class ClassLoader:
    """Resolve objects from dotted path strings such as ``pkg.module.Name``."""

    @classmethod
    def from_string(cls, class_name: str):
        """Import a module and return the attribute named by a dotted path.

        Args:
            class_name: Dotted path. Everything before the last dot is the
                module to import; the part after it is the attribute to
                fetch from that module.

        Returns:
            Whatever object the module exposes under that attribute name.

        Raises:
            ImportError: If ``class_name`` contains no dot, if the module
                cannot be imported, or if the module has no such attribute.
                The underlying exception is chained as ``__cause__``.
        """
        if "." not in class_name:
            raise ImportError(f"{class_name} doesn't look like a module path")

        # Split once on the last dot: "a.b.C" -> ("a.b", "C").
        module_name, _, attribute_name = class_name.rpartition(".")

        try:
            loaded_module = importlib.import_module(module_name)
        except Exception as e:
            # Deliberately broad: any failure while importing the configured
            # module (missing module, empty/relative name, error raised by
            # the module itself) is surfaced as ImportError, which is what
            # callers of this loader already expect.
            raise ImportError(
                f"Module {module_name} could not be imported") from e

        try:
            return getattr(loaded_module, attribute_name)
        except AttributeError as e:
            raise ImportError(
                f"Module {module_name} does not define a class named {attribute_name}") from e

View File

@@ -7,8 +7,6 @@ import time
import scrapy
from ..crawler.items import NewscrawlerItem
# to improve performance, regex statements are compiled only once per module
re_html = re.compile('text/html')
@@ -57,7 +55,8 @@ class ParseCrawler(object):
relative_local_path = self.helper.savepath_parser \
.get_savepath(response.url)
article = NewscrawlerItem()
# Instantiate the crawler item class defined in the configuration
article = self.helper.crawler_item_class()
article['local_path'] = self.helper.savepath_parser \
.get_formatted_relative_path(relative_local_path)
article['filename'] = self.helper.savepath_parser.get_filename(article['local_path'])

View File

@@ -23,6 +23,8 @@ sys.path.append(par_path)
from newsplease.config import CrawlerConfig
from newsplease.config import JsonConfig
from newsplease.helper import Helper
from newsplease.helper_classes.module_util import ClassLoader
from newsplease.crawler.items import NewscrawlerItem
try:
from _thread import start_new_thread
@@ -122,12 +124,21 @@ class SingleCrawler(object):
# absolute dir this script is in
relative_to_path = os.path.dirname(__file__)
news_item_class_name = self.cfg.section("Scrapy").get("item_class", None)
if not news_item_class_name:
news_item_class = NewscrawlerItem
else:
news_item_class = ClassLoader.from_string(news_item_class_name)
if not issubclass(news_item_class, NewscrawlerItem):
raise ImportError("ITEM_CLASS must be a subclass of NewscrawlerItem")
self.helper = Helper(self.cfg.section('Heuristics'),
self.cfg.section("Files")["local_data_directory"],
relative_to_path,
self.cfg.section('Files')['format_relative_path'],
sites,
crawler_class,
news_item_class,
self.cfg.get_working_path())
self.__scrapy_options = self.cfg.get_scrapy_options()