mount path separated

MAHMUT YILMAZ
2023-08-24 07:21:59 +03:00
parent 455a821901
commit 63d44850eb
10 changed files with 75 additions and 78 deletions

.gitignore (6 changed lines)

@@ -4,6 +4,6 @@ __pycache__/
src/custom/__pycache__/
src/generic/__pycache__/
.DS_Store
src/generic/config/nonrss_crawler_config.yml
src/generic/config/rss_crawler_config.yml
src/generic/config/
src/generic/url_match/
src/generic/rss_config/
src/generic/nonrss_config/
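
The .gitignore hunk above adds ignore rules for four configuration directories (src/generic/config/, src/generic/url_match/, src/generic/rss_config/, src/generic/nonrss_config/), apparently replacing the two fixed YAML config files, which fits the commit message about separated mount paths. As a rough illustration of what that layout enables, here is a minimal sketch of per-source config discovery; the helper name load_source_configs and the *.yml file layout are assumptions, not part of this commit.

from pathlib import Path
import yaml  # PyYAML, assumed available in the crawler environment

# Directories now ignored by git; per the commit message they are assumed
# to be mounted into the runtime environment separately.
RSS_CONFIG_DIR = Path("src/generic/rss_config")
NONRSS_CONFIG_DIR = Path("src/generic/nonrss_config")

def load_source_configs(config_dir: Path) -> dict:
    """Hypothetical helper: read every *.yml file in a mounted config directory."""
    configs = {}
    for path in sorted(config_dir.glob("*.yml")):
        with path.open() as fh:
            configs[path.stem] = yaml.safe_load(fh)
    return configs

# Example usage:
# rss_sources = load_source_configs(RSS_CONFIG_DIR)
# nonrss_sources = load_source_configs(NONRSS_CONFIG_DIR)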


@@ -27,78 +27,78 @@ def main():
logger.debug(f"SELENIUM_GRID_ADDRESS: {settings.SELENIUM_GRID_ADDRESS}")
logger.debug(f"ERROR_MAIL_LIST: {settings.ERROR_MAIL_LIST}")
while ast.literal_eval(settings.CRAWLER_DEBUG_MODE):
logger.debug(f"in Debug Mode: waiting without execution")
time.sleep(1)
#### ANADOLU AGENCY
################
driver = utils.open_browser_with_remote(headless=False)
try:
# read and collect current headlines
headlines_aa = anadolu_agency.crawl_headlines()
# start crawling each headline content
content_aa = anadolu_agency.crawl_content(driver, headlines_aa)
utils.crawled_content_into_persistency(content=content_aa, storage_type="db")
except Exception as e:
logger.error(e)
finally:
try:
driver.quit()
except WebDriverException:
pass
#### ENTERPRISE RISK MAG
################
driver = utils.open_browser_with_remote(headless=True)
try:
# read and collect current headlines
headlines_erm = enterprise_risk_mag.crawl_headlines(driver)
# start crawling each headline content
content_erm = enterprise_risk_mag.crawl_content(driver, headlines_erm)
utils.crawled_content_into_persistency(content=content_erm, storage_type="db")
except Exception as e:
logger.error(e)
finally:
try:
driver.quit()
except WebDriverException:
pass
#### ENERJI.GOV.TR
################
try:
# start crawling rss feed and parse content
content_enerji = enerji_govtr.crawl_headlines_content()
utils.crawled_content_into_persistency(content=content_enerji, storage_type="db")
except Exception as e:
logger.error(e)
#### GIGAOM
################
try:
# start crawling rss feed and parse content
content_gigaom = gigaom.crawl_headlines_content()
utils.crawled_content_into_persistency(content=content_gigaom, storage_type="db")
except Exception as e:
logger.error(e)
#### ONSOLVE
################
driver = utils.open_browser_with_remote(headless=True)
try:
# read and collect current headlines
headlines_onsolve = onsolve.crawl_headlines(driver)
# start crawling each headline content
content_olsolve = onsolve.crawl_content(driver, headlines_onsolve)
utils.crawled_content_into_persistency(content=content_olsolve, storage_type="db")
except Exception as e:
logger.error(e)
finally:
try:
driver.quit()
except WebDriverException:
pass
# while ast.literal_eval(settings.CRAWLER_DEBUG_MODE):
# logger.debug(f"in Debug Mode: waiting without execution")
# time.sleep(1)
#
# #### ANADOLU AGENCY
# ################
# driver = utils.open_browser_with_remote(headless=False)
# try:
# # read and collect current headlines
# headlines_aa = anadolu_agency.crawl_headlines()
# # start crawling each headline content
# content_aa = anadolu_agency.crawl_content(driver, headlines_aa)
# utils.crawled_content_into_persistency(content=content_aa, storage_type="db")
# except Exception as e:
# logger.error(e)
# finally:
# try:
# driver.quit()
# except WebDriverException:
# pass
#
# #### ENTERPRISE RISK MAG
# ################
# driver = utils.open_browser_with_remote(headless=True)
# try:
# # read and collect current headlines
# headlines_erm = enterprise_risk_mag.crawl_headlines(driver)
# # start crawling each headline content
# content_erm = enterprise_risk_mag.crawl_content(driver, headlines_erm)
# utils.crawled_content_into_persistency(content=content_erm, storage_type="db")
# except Exception as e:
# logger.error(e)
# finally:
# try:
# driver.quit()
# except WebDriverException:
# pass
#
# #### ENERJI.GOV.TR
# ################
# try:
# # start crawling rss feed and parse content
# content_enerji = enerji_govtr.crawl_headlines_content()
# utils.crawled_content_into_persistency(content=content_enerji, storage_type="db")
# except Exception as e:
# logger.error(e)
#
# #### GIGAOM
# ################
# try:
# # start crawling rss feed and parse content
# content_gigaom = gigaom.crawl_headlines_content()
# utils.crawled_content_into_persistency(content=content_gigaom, storage_type="db")
# except Exception as e:
# logger.error(e)
#
# #### ONSOLVE
# ################
# driver = utils.open_browser_with_remote(headless=True)
# try:
# # read and collect current headlines
# headlines_onsolve = onsolve.crawl_headlines(driver)
# # start crawling each headline content
# content_olsolve = onsolve.crawl_content(driver, headlines_onsolve)
# utils.crawled_content_into_persistency(content=content_olsolve, storage_type="db")
# except Exception as e:
# logger.error(e)
# finally:
# try:
# driver.quit()
# except WebDriverException:
# pass
#### GENERIC RSS CRAWLER
################
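
The new section ends at the "#### GENERIC RSS CRAWLER" banner in this view. Reusing the crawl-then-persist pattern from the blocks above, a generic pass over the separated RSS configs might look roughly like the sketch below; generic_rss, rss_sources, and crawl_generic_rss are hypothetical names, not code from this commit, while utils, logger, and crawled_content_into_persistency are names the module already uses.

# Sketch only: iterate per-source RSS configs and reuse the existing
# crawl/persist pattern. `generic_rss` and `rss_sources` are assumed to
# exist; `utils` and `logger` are the names already used in this module.
def crawl_generic_rss(rss_sources: dict) -> None:
    for source_name, source_cfg in rss_sources.items():
        try:
            content = generic_rss.crawl_headlines_content(source_cfg)
            utils.crawled_content_into_persistency(content=content, storage_type="db")
        except Exception as e:
            logger.error(f"{source_name}: {e}")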

File diff suppressed for three further files because one or more lines are too long