1
0
mirror of https://github.com/michaelharms/comcrawl.git synced 2021-09-27 00:43:48 +03:00
Files
comcrawl-common-crawl/examples/drop_duplicates.py
2020-01-10 18:54:08 +01:00

10 lines
250 B
Python

import comcrawl as cc
# Look up every index entry matching the URL pattern via comcrawl.
# NOTE(review): the calls below assume the search result behaves like a
# pandas DataFrame (sort_values / drop_duplicates / to_csv) -- confirm
# against the installed comcrawl version.
index_hits = cc.search("https://index.commoncrawl.org/*")

# Order the hits by their "timestamp" column, then keep only the first row
# seen for each distinct "url" value.
results = (
    index_hits
    .sort_values(by="timestamp")
    .drop_duplicates("url", keep="first")
)

# Download the pages for the remaining rows and store them in an "html"
# column next to the index data.
results["html"] = cc.download(results)

# Write the final table to results.csv in the current working directory.
results.to_csv("results.csv")