#!/usr/bin/python3
"""Scrape a list of site names from htmlstrip.com and save them as JSON.

The script opens the "alexa-top-1000-most-visited-websites" page in Chrome,
reads the text of the second ``col-6`` cell of every ``row`` element inside
the ``table`` element, prefixes each value with ``https://www.`` and writes
the resulting list of URLs to ``alexa1000.json`` in the current directory.
"""
# Driver resources
# chromedriver https://sites.google.com/chromium.org/driver/
# firefoxdriver https://github.com/mozilla/geckodriver/releases
import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

SOURCE_URL = "https://www.htmlstrip.com/alexa-top-1000-most-visited-websites"
OUTPUT_PATH = 'alexa1000.json'

# The chromedriver binary is resolved through webdriver_manager. To use a
# locally installed binary instead, pass its path to Service(), e.g.
# Service('/path/to/chromedriver').
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(SOURCE_URL)

    # The find_element_by_* helpers were removed in Selenium 4.3; the
    # By-based locators work on every Selenium 4 release.
    table = driver.find_element(By.CLASS_NAME, 'table')
    rows = table.find_elements(By.CLASS_NAME, 'row')

    # Index 1 selects the second 'col-6' cell of a row; its text is used as
    # the host name (presumably the site's domain -- verify against the page).
    sites = [
        f"https://www.{row.find_elements(By.CLASS_NAME, 'col-6')[1].text}"
        for row in rows
    ]
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and runs even when scraping fails; close() only closed the window.
    driver.quit()

with open(OUTPUT_PATH, 'w', encoding='utf-8') as file:
    json.dump(sites, file)