爬取京东价格
需要爬取的数据为
价格
商品ID
标题
评价(总评价+优评价)
店名
是否自营
代码实现
import requests
from lxml import etree
import time
import random
import pandas as pd
import json
from sqlalchemy import create_engine
from sqlalchemy.dialects.oracle import DATE,FLOAT,NUMBER,VARCHAR2
import cx_Oracle
# 先导入需要用的包
def create_table(table_name):
    """Create the Oracle table that receives the scraped product rows.

    Opens a cx_Oracle connection, executes one ``CREATE TABLE`` statement
    and releases the cursor and the connection afterwards, including when
    the statement raises.

    Args:
        table_name: Name of the table to create. It is interpolated into
            the DDL text with ``str.format`` (an identifier cannot be sent
            as a bind variable), so it must come from trusted code.

    Raises:
        Whatever ``cx_Oracle.connect`` or ``cursor.execute`` raises is
        propagated unchanged.
    """
    create_shouji = '''
        CREATE TABLE {}(
            商品ID VARCHAR2(256),
            价格 number(19,8),
            店名 VARCHAR2(256),
            店属性 VARCHAR2(256),
            标题 VARCHAR2(256),
            评论 NUMBER(19),
            优评论 NUMBER(19)
        )
    '''.format(table_name)

    conn = cx_Oracle.connect('user/password@IP:port/database')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(create_shouji)
        finally:
            # Close the cursor even if the DDL fails.
            cursor.close()
    finally:
        # Never leave the session open on the database side.
        conn.close()
建表
def mapping_df_types(df_pro):
    """Build the ``dtype`` mapping passed to ``DataFrame.to_sql`` for Oracle.

    Each column's dtype is converted to its string name and matched by
    substring:

    * ``object``   -> ``VARCHAR2(256)``
    * ``float``    -> ``NUMBER(19, 8)``
    * ``int``      -> ``NUMBER(19, 8)``
    * ``datetime`` -> ``DATE``

    Columns whose dtype name matches none of the markers are left out of
    the mapping.

    Args:
        df_pro: Object exposing ``columns`` and ``dtypes`` (a pandas
            DataFrame in this script).

    Returns:
        dict mapping column name to the SQLAlchemy Oracle type.
    """
    dtypedict = {}
    for column, dtype in zip(df_pro.columns, df_pro.dtypes):
        dtype_name = str(dtype)
        # The checks are deliberately independent (not ``elif``): when a
        # dtype name contains several markers, the last match wins.
        if 'object' in dtype_name:
            dtypedict[column] = VARCHAR2(256)
        if 'float' in dtype_name:
            dtypedict[column] = NUMBER(19, 8)
        if 'int' in dtype_name:
            dtypedict[column] = NUMBER(19, 8)
        if 'datetime' in dtype_name:
            dtypedict[column] = DATE
    return dtypedict
# The function above defines the pandas-dtype -> Oracle-type mapping.
def sava_oracle(df_pro):
    """Append the rows of ``df_pro`` to the Oracle table ``shouji``.

    A SQLAlchemy engine is created for this call, the column types are
    derived with ``mapping_df_types`` and the frame is written with
    ``DataFrame.to_sql(..., if_exists='append', index=False)``.

    Args:
        df_pro: DataFrame holding the rows to insert.

    Raises:
        Whatever ``create_engine``, ``mapping_df_types`` or ``to_sql``
        raises is propagated unchanged.
    """
    engine = create_engine('oracle://user:password@ip:port/database')
    try:
        dtypedict = mapping_df_types(df_pro)
        df_pro.to_sql(
            'shouji',
            con=engine,
            index=False,
            if_exists='append',
            dtype=dtypedict,
        )
    finally:
        # A fresh engine is built on every call; dispose of it so its
        # pooled connections are released instead of piling up across calls.
        engine.dispose()
定义请求头和请求方法
# Request headers shared by every HTTP call in this script; only a
# browser-like User-Agent is set.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.37'
    ),
}
def requesturl(url, timeout=10):
    """Fetch ``url`` with the module-level ``headers`` and return the response.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The ``requests`` response object. Its body is fully downloaded
        before the session is closed, so ``.text`` remains usable.

    Raises:
        Whatever ``requests`` raises (connection errors, timeouts) is
        propagated unchanged.
    """
    # The context manager closes the session (and its sockets) on exit.
    with requests.Session() as session:
        return session.get(url, headers=headers, timeout=timeout)
# Next step: request and parse the comment-summary URL.
def commreq(url_comm, timeout=10):
    """Download the comment summary and return it as a DataFrame.

    The response body is parsed as JSON; every entry under its
    ``CommentsCount`` key contributes one row built from the entry's
    ``ProductId`` (converted to ``str``), ``CommentCount`` and
    ``GoodCount`` fields.

    Args:
        url_comm: Comment-summary URL to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        DataFrame with the columns ``商品ID``, ``评论`` and ``优评论``
        (empty, but with those columns, when there are no entries).

    Raises:
        KeyError: If the JSON has no ``CommentsCount`` key or an entry
            lacks one of the fields read above.
        Whatever ``requests`` or ``json.loads`` raises is propagated
        unchanged.
    """
    with requests.Session() as session:
        rep_comm = session.get(url_comm, headers=headers, timeout=timeout)

    comment = json.loads(rep_comm.text)['CommentsCount']
    comment_list = [
        {
            '商品ID': str(item['ProductId']),
            '评论': item['CommentCount'],
            '优评论': item['GoodCount'],
        }
        for item in comment
    ]
    # Build the frame in one step: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(comment_list, columns=['商品ID', '评论', '优评论'])
主体解析
def parse(rep):
    """Extract the product rows from a search-result page.

    The page HTML is read from ``rep.text``. Every ``data-sku`` found on
    the page is joined into one comment-summary request (via ``commreq``),
    then each ``<li>`` under ``ul.gl-warp.clearfix`` is turned into a row,
    and both tables are joined on ``商品ID``.

    Args:
        rep: Response object whose ``text`` attribute holds the page HTML.

    Returns:
        DataFrame produced by ``pd.merge(products, comments, on='商品ID')``,
        i.e. the product columns ``商品ID``, ``价格``, ``店名``, ``店属性``,
        ``标题`` plus the comment columns returned by ``commreq``.
    """
    html = etree.HTML(rep.text)
    # XPath literals use single quotes, so the Python strings are
    # double-quoted to keep them valid.
    all_pro = html.xpath("//ul[@class='gl-warp clearfix']/li")
    proid = ','.join(html.xpath('//li/@data-sku'))

    # 1. Comment counters: the endpoint takes the product ids as a
    #    comma-separated list after ``referenceIds=``.
    url_comm = r'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'.format(proid)
    dd_commt = commreq(url_comm)

    # 2. Product list: one dict per <li>.
    pro_list = []
    for product in all_pro:
        proid = ''.join(product.xpath('@data-sku'))
        price = ''.join(product.xpath("div[@class='gl-i-wrap']//strong/i/text()"))
        target = (
            ''.join(product.xpath("div[@class='gl-i-wrap']//a/em//text()"))
            .replace('\t\n', '')
            .replace('\u2122', '')
        )
        shopname = ''.join(product.xpath("div[@class='gl-i-wrap']//span/a/@title"))
        shoptips = product.xpath(
            "div[@class='gl-i-wrap']//i[contains(@class,'goods-icon')]/text()"
        )
        # ``shoptips`` is the list of badge texts; the shop counts as
        # self-operated only if one badge is exactly '自营'.
        shoptips = '自营' if '自营' in shoptips else '非自营'
        pro_list.append(dict(商品ID=proid, 价格=price, 店名=shopname, 店属性=shoptips, 标题=target))

    # Build the frame in one step: DataFrame.append was removed in pandas 2.0.
    df = pd.DataFrame(pro_list, columns=['商品ID', '价格', '店名', '店属性', '标题'])

    # 3. Combine the product rows with their comment counters.
    df_pro = pd.merge(df, dd_commt, on='商品ID')
    return df_pro
# Next step: the main program.
if __name__ == '__main__':
    # Create the destination table once, then crawl result pages 1..80:
    # fetch the page, parse it, store the rows and pause 1-3 seconds
    # before the next request.
    create_table('shouji')
    for i in range(1, 81):
        # The original query string read ``&wq手机&`` with the ``=`` missing;
        # it is restored here so ``wq`` actually carries the keyword.
        url = 'https://search.jd.com/s_new.php?keyword=手机&wq=手机&ev=3613_104528%5E&page={0}&s=30'.format(i)
        rep = requesturl(url)
        df_pro = parse(rep)
        sava_oracle(df_pro)
        time.sleep(random.randrange(1, 4))
        print('完成:', i)
赞 (0)
