mirror of
https://github.com/michaelharms/comcrawl.git
synced 2021-09-27 00:43:48 +03:00
10 lines · 250 B · Python
import comcrawl as cc
|
|
|
|
# Query the index, order the hits by their "timestamp" column, and keep only
# the first row seen for each "url" value.
# NOTE(review): the chained calls assume cc.search returns a pandas-style
# DataFrame (sort_values / drop_duplicates) — confirm against the comcrawl API.
results = (
    cc.search("https://index.commoncrawl.org/*")
    .sort_values(by="timestamp")
    .drop_duplicates("url", keep="first")
)

# Store whatever cc.download returns for the remaining rows in an "html" column.
results["html"] = cc.download(results)

# Write the final table to results.csv in the current working directory.
results.to_csv("results.csv")