python数据采集：某东商品评论采集

某东的python对应代码

2301_79959211

353人浏览 · 2025-02-23 10:59:31

2301_79959211 · 2025-02-23 10:59:31 发布

前言

本文章的jd爬虫专门针对小白设计，不涉及过于深入的知识，主要包含：

1. python代码基本的阅读和编写能力

2. python操作数据库的相关库（这里以pymysql操作mysql为例子）

3. python的requests包

4. 网络爬虫的相关知识

因为过于简单，希望各位大佬不喜勿喷

正文

代码解析

数据库部分

def create_table_jingdong():
    """Create the ``jingdong`` review table in the ``comment`` database.

    Connects to the local MySQL server, selects the ``comment`` database and
    runs ``CREATE TABLE IF NOT EXISTS``, so calling it repeatedly is harmless.
    Any error is printed instead of raised; the connection is always closed.
    """
    conn = pymysql.connect(
        host='127.0.0.1',  # host name (or IP address)
        port=3306,         # MySQL port
        user='root',       # user name
        password='你的密码', # password
        charset='utf8mb4'  # connection character set
    )
    try:
        # Select the database that holds the table.
        conn.select_db("comment")

        # MySQL DDL for the review table.
        sql = ("""
                   CREATE TABLE IF NOT EXISTS `jingdong` (
                       id INT AUTO_INCREMENT PRIMARY KEY,
                       product LONGTEXT,
                       location LONGTEXT,
                       comment LONGTEXT,
                       star int,
                       comment_time LONGTEXT,
                       catagory LONGTEXT
                   )
               """)

        # The context manager closes the cursor even if execute() fails.
        with conn.cursor() as cursor:
            cursor.execute(sql)

        conn.commit()
    except Exception as e:
        print(e)
    finally:
        # Single cleanup path: the connection is closed exactly once.
        conn.close()

这里jd的数据表建立在comment数据库下，包含的字段有：id（数据库的主键，对数据分析无用）、产品名、用户的地点、评论内容、评论星级、评论时间。catagory不用在意，它不是爬取出来的，而是根据你的目标自己设置的，基本没用。

def insert_jingdong(product, comment, star, comment_time, location):
    """Insert one scraped review into the ``jingdong`` table.

    A new MySQL connection is opened for every call. The row also stores the
    module-level ``catagory`` value. Errors are printed instead of raised, and
    the connection is always closed.

    Args:
        product: Product name the review refers to.
        comment: Review text.
        star: Review score.
        comment_time: Review timestamp as returned by the API.
        location: Reviewer location (may be an empty string).
    """
    conn = pymysql.connect(
        host='127.0.0.1',  # host name (or IP address)
        port=3306,         # MySQL port
        user='root',       # user name
        password='你的密码', # password
        charset='utf8mb4'  # connection character set
    )
    try:
        # Select the database that holds the table.
        conn.select_db("comment")

        # Placeholders let the driver escape the values. Review text is
        # untrusted and often contains quotes, which broke the previous
        # f-string-built statement (and allowed SQL injection).
        sql = (
            "insert into `jingdong` "
            "(product,comment,star,comment_time,location,catagory) "
            "VALUES (%s,%s,%s,%s,%s,%s)"
        )
        values = (product, comment, star, comment_time, location, catagory)

        # The context manager closes the cursor even if execute() fails.
        with conn.cursor() as cursor:
            cursor.execute(sql, values)

        conn.commit()
    except Exception as e:
        print(e)
    finally:
        # Single cleanup path: the connection is closed exactly once.
        conn.close()

这是对于jd数据表的插入部分

爬虫部分

def start(id, page, ip_proxy, timeout=10):
    """Fetch one page of comments for a JD product and return the parsed JSON.

    Args:
        id: Product ID placed in the ``productId`` query parameter.
        page: Page index placed in the ``page`` query parameter.
        ip_proxy: Proxy configuration for ``requests``. Either a mapping such
            as ``{"http": ..., "https": ...}`` or a single proxy URL string,
            which is applied to both schemes.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a dead proxy would block the crawl indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On network errors or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Build the comment-page URL.
    url = ('https://club.jd.com/comment/productPageComments.action?'
           f'&productId={id}'  # product ID
           f'&score=0'  # 0 = all, 1 = good, 2 = medium, 3 = bad, 5 = follow-up
           '&sortType=5'  # sort type (5 is the usual value)
           f'&page={page}'  # page to fetch
           '&pageSize=10'  # comments per page
           '&isShadowSku=0'
           '&fold=1')

    # Headers that make the request look like it comes from a browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36"
    }

    # requests expects a scheme -> proxy URL mapping; accept a bare URL too.
    if isinstance(ip_proxy, str):
        ip_proxy = {'http': ip_proxy, 'https': ip_proxy}

    # Pause before every request to avoid being rate-limited.
    time.sleep(2)
    response = requests.get(url=url, headers=headers, proxies=ip_proxy,
                            timeout=timeout)
    # Decode the JSON payload.
    data = json.loads(response.text)
    return data

这里是可以不用cookie的，不是设置错误（可以自己实践一次看一看）

产品id的查看方法：

商品页面上显示的“商品编号”就是产品id

ip_proxy，这个是你的代理ip，免费的ip代理在很多平台上都有，可以直接在百度或者谷歌上搜索。

注意：

一定要在请求之间加入适当的停顿，否则会被京东官方限制。我这里使用time包来停顿，也可以使用更高级的方式，但是这个最方便。

def parse(data):
    """Yield the fields of interest for each comment in an API response.

    Args:
        data: Decoded response; expected to contain a ``'comments'`` list of
            dicts.

    Yields:
        Tuples ``(content, location, referenceName, referenceTime, score)``.
        ``location`` defaults to ``""`` when the comment has no such key.

    Nothing is yielded when ``'comments'`` is missing, empty or ``None``. Any
    exception raised while reading a comment is printed and ends the
    iteration. The input is never modified.
    """
    try:
        print("解析开始")
        if 'comments' not in data:
            print("数据缺少 'comments' 键")
            return
        # `or ()` covers both an empty list and a null "comments" value.
        for review in data['comments'] or ():
            yield (
                review['content'],
                # Read with a default instead of writing the key back into
                # the caller's dict.
                review.get('location', ''),
                review['referenceName'],
                review['referenceTime'],
                review['score'],
            )
    except Exception as e:
        print(f"解析错误: {e}")

这一部分是对爬取之后的数据进行解析，从而与数据库的数据格式对应起来，方便对于数据库的插入。

def main():
    """Crawl the comments of every product in ``idlist`` and store them.

    For each product, pages ``0 .. PAGE - 1`` are requested through a randomly
    chosen proxy from ``proxy_list``. A page whose payload is not a dict with
    a ``'comments'`` key is reported and skipped. The first page that yields
    no comments ends the crawl for that product. Every parsed comment is
    written with ``insert_jingdong``.
    """
    total_pages = PAGE  # total number of pages to crawl per product
    for product_id in idlist:
        # Pages are zero-based, so range(total_pages) is exactly that many.
        for current_page in range(total_pages):
            time.sleep(1.5)
            # random.choice works for any non-empty pool size; the previous
            # fixed index range 0..9 required exactly ten proxies.
            proxy = random.choice(proxy_list)
            data = start(product_id, current_page, proxy)

            # Validate the payload shape before parsing.
            if not isinstance(data, dict) or 'comments' not in data:
                print(f"数据格式错误: {data}")
                continue

            # Materialize the generator so emptiness can be tested.
            parsed_data = list(parse(data))
            if not parsed_data:
                break

            for item in parsed_data:
                print("item", item)
                content, location, product, comment_time, star = item
                insert_jingdong(product, content, star, comment_time, location)

            print(f"第{product_id}个 第{current_page + 1}页抓取完毕")

主函数，整理整个程序的步骤，并将全部数据串联起来

代码全貌

# 导入必要的库
import os
import random

import pymysql
import requests
import json
import time
# Page-count setting read by main() as the total number of pages to crawl
# per product; main() stops earlier once a page yields no comments.
PAGE = 500000

# Proxy pool (placeholder entry). main() selects an entry at random for each
# request and passes it to start().
proxy_list =[
    'ip代理的列表'
]
# Value written to the `catagory` column by insert_jingdong(); it is set by
# hand here rather than scraped.
catagory = '泥膜'

# Product IDs to crawl (placeholder entry); main() iterates over this list.
idlist=[
    '设置对应商品的id',
]
def insert_jingdong(product, comment, star, comment_time, location):
    """Insert one scraped review into the ``jingdong`` table.

    A new MySQL connection is opened for every call. The row also stores the
    module-level ``catagory`` value. Errors are printed instead of raised, and
    the connection is always closed.

    Args:
        product: Product name the review refers to.
        comment: Review text.
        star: Review score.
        comment_time: Review timestamp as returned by the API.
        location: Reviewer location (may be an empty string).
    """
    conn = pymysql.connect(
        host='127.0.0.1',  # host name (or IP address)
        port=3306,         # MySQL port
        user='root',       # user name
        password='你的密码', # password
        charset='utf8mb4'  # connection character set
    )
    try:
        # Select the database that holds the table.
        conn.select_db("comment")

        # Placeholders let the driver escape the values. Review text is
        # untrusted and often contains quotes, which broke the previous
        # f-string-built statement (and allowed SQL injection).
        sql = (
            "insert into `jingdong` "
            "(product,comment,star,comment_time,location,catagory) "
            "VALUES (%s,%s,%s,%s,%s,%s)"
        )
        values = (product, comment, star, comment_time, location, catagory)

        # The context manager closes the cursor even if execute() fails.
        with conn.cursor() as cursor:
            cursor.execute(sql, values)

        conn.commit()
    except Exception as e:
        print(e)
    finally:
        # Single cleanup path: the connection is closed exactly once.
        conn.close()

def start(id, page, ip_proxy, timeout=10):
    """Fetch one page of comments for a JD product and return the parsed JSON.

    Args:
        id: Product ID placed in the ``productId`` query parameter.
        page: Page index placed in the ``page`` query parameter.
        ip_proxy: Proxy configuration for ``requests``. Either a mapping such
            as ``{"http": ..., "https": ...}`` or a single proxy URL string,
            which is applied to both schemes.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a dead proxy would block the crawl indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On network errors or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Build the comment-page URL.
    url = ('https://club.jd.com/comment/productPageComments.action?'
           f'&productId={id}'  # product ID
           f'&score=0'  # 0 = all, 1 = good, 2 = medium, 3 = bad, 5 = follow-up
           '&sortType=5'  # sort type (5 is the usual value)
           f'&page={page}'  # page to fetch
           '&pageSize=10'  # comments per page
           '&isShadowSku=0'
           '&fold=1')

    # Headers that make the request look like it comes from a browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36"
    }

    # requests expects a scheme -> proxy URL mapping; accept a bare URL too.
    if isinstance(ip_proxy, str):
        ip_proxy = {'http': ip_proxy, 'https': ip_proxy}

    # Pause before every request to avoid being rate-limited.
    time.sleep(2)
    response = requests.get(url=url, headers=headers, proxies=ip_proxy,
                            timeout=timeout)
    # Decode the JSON payload.
    data = json.loads(response.text)
    return data


def parse(data):
    """Yield the fields of interest for each comment in an API response.

    Args:
        data: Decoded response; expected to contain a ``'comments'`` list of
            dicts.

    Yields:
        Tuples ``(content, location, referenceName, referenceTime, score)``.
        ``location`` defaults to ``""`` when the comment has no such key.

    Nothing is yielded when ``'comments'`` is missing, empty or ``None``. Any
    exception raised while reading a comment is printed and ends the
    iteration. The input is never modified.
    """
    try:
        print("解析开始")
        if 'comments' not in data:
            print("数据缺少 'comments' 键")
            return
        # `or ()` covers both an empty list and a null "comments" value.
        for review in data['comments'] or ():
            yield (
                review['content'],
                # Read with a default instead of writing the key back into
                # the caller's dict.
                review.get('location', ''),
                review['referenceName'],
                review['referenceTime'],
                review['score'],
            )
    except Exception as e:
        print(f"解析错误: {e}")

def main():
    """Crawl the comments of every product in ``idlist`` and store them.

    For each product, pages ``0 .. PAGE - 1`` are requested through a randomly
    chosen proxy from ``proxy_list``. A page whose payload is not a dict with
    a ``'comments'`` key is reported and skipped. The first page that yields
    no comments ends the crawl for that product. Every parsed comment is
    written with ``insert_jingdong``.
    """
    total_pages = PAGE  # total number of pages to crawl per product
    for product_id in idlist:
        # Pages are zero-based, so range(total_pages) is exactly that many.
        for current_page in range(total_pages):
            time.sleep(1.5)
            # random.choice works for any non-empty pool size; the previous
            # fixed index range 0..9 required exactly ten proxies.
            proxy = random.choice(proxy_list)
            data = start(product_id, current_page, proxy)

            # Validate the payload shape before parsing.
            if not isinstance(data, dict) or 'comments' not in data:
                print(f"数据格式错误: {data}")
                continue

            # Materialize the generator so emptiness can be tested.
            parsed_data = list(parse(data))
            if not parsed_data:
                break

            for item in parsed_data:
                print("item", item)
                content, location, product, comment_time, star = item
                insert_jingdong(product, content, star, comment_time, location)

            print(f"第{product_id}个 第{current_page + 1}页抓取完毕")


def create_table_jingdong():
    """Create the ``jingdong`` review table in the ``comment`` database.

    Connects to the local MySQL server, selects the ``comment`` database and
    runs ``CREATE TABLE IF NOT EXISTS``, so calling it repeatedly is harmless.
    Any error is printed instead of raised; the connection is always closed.
    """
    conn = pymysql.connect(
        host='127.0.0.1',  # host name (or IP address)
        port=3306,         # MySQL port
        user='root',       # user name
        password='你的密码', # password
        charset='utf8mb4'  # connection character set
    )
    try:
        # Select the database that holds the table.
        conn.select_db("comment")

        # MySQL DDL for the review table.
        sql = ("""
                   CREATE TABLE IF NOT EXISTS `jingdong` (
                       id INT AUTO_INCREMENT PRIMARY KEY,
                       product LONGTEXT,
                       location LONGTEXT,
                       comment LONGTEXT,
                       star int,
                       comment_time LONGTEXT,
                       catagory LONGTEXT
                   )
               """)

        # The context manager closes the cursor even if execute() fails.
        with conn.cursor() as cursor:
            cursor.execute(sql)

        conn.commit()
    except Exception as e:
        print(e)
    finally:
        # Single cleanup path: the connection is closed exactly once.
        conn.close()

# When run as a standalone script: make sure the target table exists (the
# statement is CREATE TABLE IF NOT EXISTS, so repeating it is harmless), then
# start the crawl. Without this step a first run fails every insert.
if __name__ == '__main__':
    create_table_jingdong()
    main()