python3 selenium 抓取网页多个表格数据,并导入 Excel 中
1. 首先我需要抓取数据的网址是:
'https://mtj.baidu.com/data/mobile/device'
2. 打开浏览器,输入网址,是这个页面

3. 我想要抓取品牌、机型、系统、分辨率、联网这五个类别各自对应的右边的表格
4. 使用 PyCharm IDE,下载 selenium 模块以及代码中导入的 Excel 相关模块(xlwt、xlrd、openpyxl)
pip install selenium xlwt xlrd openpyxl(请在 python 安装时勾选 pip)
5. 代码如下
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Scrape the per-category tables from the Baidu mobile statistics page.

The script opens the page in Chrome, clicks each of the five category
buttons (brand, device, OS, screen resolution, network), reads the table
shown for that category and writes one worksheet per category into an
``.xls`` workbook.
"""
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import openpyxl
import sys
import datetime
import importlib
import xlwt
import xlrd

url = 'https://mtj.baidu.com/data/mobile/device'

# Shared WebDriver instance used by the helpers below; created in main().
browser = None


def wait(class_name, retries=10, delay=0.5):
    """Click the first element with ``class_name``, retrying until it works.

    Args:
        class_name: CSS class name of the element to click.
        retries: Maximum number of attempts.
        delay: Seconds to sleep after a failed attempt.

    Returns:
        True if the click succeeded, False if every attempt failed.
    """
    for _ in range(retries):
        try:
            browser.find_element(By.CLASS_NAME, class_name).click()
            return True
        except WebDriverException:
            # The element may not be rendered or clickable yet; try again.
            time.sleep(delay)
    return False


def waits(class_name, retries=10, delay=10):
    """Return all elements with ``class_name``, retrying on driver errors.

    Args:
        class_name: CSS class name to look up.
        retries: Maximum number of attempts.
        delay: Seconds to sleep after a failed attempt.

    Returns:
        The list of matching elements, or an empty list if every attempt
        raised (previously this case crashed with UnboundLocalError).
    """
    for _ in range(retries):
        try:
            return browser.find_elements(By.CLASS_NAME, class_name)
        except WebDriverException:
            time.sleep(delay)
    return []


def save_data(dict):
    """Write the scraped data to an ``.xls`` workbook, one sheet per key.

    Args:
        dict: Mapping of sheet name to a flat list laid out as
            ``[name, share, name, share, ...]``. The parameter name is kept
            for backward compatibility even though it shadows the builtin.
    """
    file_name = u'百度研究学院移动平台.xls'
    workbook = xlwt.Workbook(encoding='utf-8')
    for sheet_name, flat_values in dict.items():
        sheet = workbook.add_sheet(sheet_name, cell_overwrite_ok=True)
        # Header row: the category name and the share column title.
        for column, head in enumerate([sheet_name, '占比']):
            sheet.write(0, column, head)
        # Values alternate name/share, so each pair fills one row below
        # the header: even index -> column 0, odd index -> column 1.
        for index, value in enumerate(flat_values):
            sheet.write(index // 2 + 1, index % 2, value)
    workbook.save(file_name)


def wait_refresh():
    """Reload the current page and report on stdout whether it worked."""
    try:
        browser.refresh()
        print('test pass: refresh successful')
        time.sleep(1)
    except WebDriverException as e:
        print('Exception found', format(e))


def get_data():
    """Click through the five category buttons and collect each table.

    Returns:
        A dict keyed by the button class name. Each value is a flat list
        ``[name, share, name, share, ...]`` built from the text of the
        ``dtd1`` and ``dtd3`` cells (presumably the name and share columns
        of the table; verify against the page markup).
    """
    # Class names of the five category buttons:
    # brand, device, OS, screen resolution, network.
    list_button = ['icon-brand', 'icon-device', 'icon-os', 'icon-screen',
                   'icon-network']
    collected = {button: [] for button in list_button}

    for button in list_button:
        print('************', button, '********************')
        if not wait(button):
            # Without the click the page still shows the previous table;
            # scraping it would store data under the wrong category.
            print('Exception found', 'could not click ' + button)
            continue
        # Give the page time to swap in the table for this category.
        time.sleep(2)
        name_cells = browser.find_elements(By.CLASS_NAME, 'dtd1')
        rank_cells = browser.find_elements(By.CLASS_NAME, 'dtd3')
        # zip() stops at the shorter list, so a missing share cell can no
        # longer raise IndexError.
        for name_cell, rank_cell in zip(name_cells, rank_cells):
            collected[button].append(name_cell.get_attribute('textContent'))
            collected[button].append(rank_cell.get_attribute('textContent'))
        print(collected)
    return collected


def main():
    """Open the page in Chrome, scrape the tables and save the workbook."""
    global browser
    browser = webdriver.Chrome()
    try:
        browser.maximize_window()
        browser.get(url)
        dict_data = get_data()
        save_data(dict_data)
    finally:
        # Always shut down Chrome and chromedriver, even on failure.
        browser.quit()


if __name__ == '__main__':
    main()
6. 生成的表格形式如下

赞 (0)
