mount path separated
.gitignore (vendored): 6 lines changed (+3, -3)
@@ -4,6 +4,6 @@ __pycache__/
 src/custom/__pycache__/
 src/generic/__pycache__/
 .DS_Store
-src/generic/config/nonrss_crawler_config.yml
-src/generic/config/rss_crawler_config.yml
-src/generic/config/
+src/generic/url_match/
+src/generic/rss_config/
+src/generic/nonrss_config/
src/main.py: 144 lines changed (+72, -72)
@@ -27,78 +27,78 @@ def main():
     logger.debug(f"SELENIUM_GRID_ADDRESS: {settings.SELENIUM_GRID_ADDRESS}")
     logger.debug(f"ERROR_MAIL_LIST: {settings.ERROR_MAIL_LIST}")
 
-    while ast.literal_eval(settings.CRAWLER_DEBUG_MODE):
-        logger.debug(f"in Debug Mode: waiting without execution")
-        time.sleep(1)
-
-    #### ANADOLU AGENCY
-    ################
-    driver = utils.open_browser_with_remote(headless=False)
-    try:
-        # read and collect current headlines
-        headlines_aa = anadolu_agency.crawl_headlines()
-        # start crawling each headline content
-        content_aa = anadolu_agency.crawl_content(driver, headlines_aa)
-        utils.crawled_content_into_persistency(content=content_aa, storage_type="db")
-    except Exception as e:
-        logger.error(e)
-    finally:
-        try:
-            driver.quit()
-        except WebDriverException:
-            pass
-
-    #### ENTERPRISE RISK MAG
-    ################
-    driver = utils.open_browser_with_remote(headless=True)
-    try:
-        # read and collect current headlines
-        headlines_erm = enterprise_risk_mag.crawl_headlines(driver)
-        # start crawling each headline content
-        content_erm = enterprise_risk_mag.crawl_content(driver, headlines_erm)
-        utils.crawled_content_into_persistency(content=content_erm, storage_type="db")
-    except Exception as e:
-        logger.error(e)
-    finally:
-        try:
-            driver.quit()
-        except WebDriverException:
-            pass
-
-    #### ENERJI.GOV.TR
-    ################
-    try:
-        # start crawling rss feed and parse content
-        content_enerji = enerji_govtr.crawl_headlines_content()
-        utils.crawled_content_into_persistency(content=content_enerji, storage_type="db")
-    except Exception as e:
-        logger.error(e)
-
-    #### GIGAOM
-    ################
-    try:
-        # start crawling rss feed and parse content
-        content_gigaom = gigaom.crawl_headlines_content()
-        utils.crawled_content_into_persistency(content=content_gigaom, storage_type="db")
-    except Exception as e:
-        logger.error(e)
-
-    #### ONSOLVE
-    ################
-    driver = utils.open_browser_with_remote(headless=True)
-    try:
-        # read and collect current headlines
-        headlines_onsolve = onsolve.crawl_headlines(driver)
-        # start crawling each headline content
-        content_olsolve = onsolve.crawl_content(driver, headlines_onsolve)
-        utils.crawled_content_into_persistency(content=content_olsolve, storage_type="db")
-    except Exception as e:
-        logger.error(e)
-    finally:
-        try:
-            driver.quit()
-        except WebDriverException:
-            pass
+    # while ast.literal_eval(settings.CRAWLER_DEBUG_MODE):
+    #     logger.debug(f"in Debug Mode: waiting without execution")
+    #     time.sleep(1)
+    #
+    # #### ANADOLU AGENCY
+    # ################
+    # driver = utils.open_browser_with_remote(headless=False)
+    # try:
+    #     # read and collect current headlines
+    #     headlines_aa = anadolu_agency.crawl_headlines()
+    #     # start crawling each headline content
+    #     content_aa = anadolu_agency.crawl_content(driver, headlines_aa)
+    #     utils.crawled_content_into_persistency(content=content_aa, storage_type="db")
+    # except Exception as e:
+    #     logger.error(e)
+    # finally:
+    #     try:
+    #         driver.quit()
+    #     except WebDriverException:
+    #         pass
+    #
+    # #### ENTERPRISE RISK MAG
+    # ################
+    # driver = utils.open_browser_with_remote(headless=True)
+    # try:
+    #     # read and collect current headlines
+    #     headlines_erm = enterprise_risk_mag.crawl_headlines(driver)
+    #     # start crawling each headline content
+    #     content_erm = enterprise_risk_mag.crawl_content(driver, headlines_erm)
+    #     utils.crawled_content_into_persistency(content=content_erm, storage_type="db")
+    # except Exception as e:
+    #     logger.error(e)
+    # finally:
+    #     try:
+    #         driver.quit()
+    #     except WebDriverException:
+    #         pass
+    #
+    # #### ENERJI.GOV.TR
+    # ################
+    # try:
+    #     # start crawling rss feed and parse content
+    #     content_enerji = enerji_govtr.crawl_headlines_content()
+    #     utils.crawled_content_into_persistency(content=content_enerji, storage_type="db")
+    # except Exception as e:
+    #     logger.error(e)
+    #
+    # #### GIGAOM
+    # ################
+    # try:
+    #     # start crawling rss feed and parse content
+    #     content_gigaom = gigaom.crawl_headlines_content()
+    #     utils.crawled_content_into_persistency(content=content_gigaom, storage_type="db")
+    # except Exception as e:
+    #     logger.error(e)
+    #
+    # #### ONSOLVE
+    # ################
+    # driver = utils.open_browser_with_remote(headless=True)
+    # try:
+    #     # read and collect current headlines
+    #     headlines_onsolve = onsolve.crawl_headlines(driver)
+    #     # start crawling each headline content
+    #     content_olsolve = onsolve.crawl_content(driver, headlines_onsolve)
+    #     utils.crawled_content_into_persistency(content=content_olsolve, storage_type="db")
+    # except Exception as e:
+    #     logger.error(e)
+    # finally:
+    #     try:
+    #         driver.quit()
+    #     except WebDriverException:
+    #         pass
 
     #### GENERIC RSS CRAWLER
     ################
File diff suppressed because one or more lines are too long