Python爬虫自己写项目之：爬取火车站的时刻表和票务信息

目标：给定站点信息（始发站和终点站）和发车时间，得到相应的信息表格。代码：from selenium import webdriver; import sys; from selenium.webdriver.common.by import By; from selenium.webdriver.support.select import Select; import re; from lxml import etree ……

暖仔会飞

1767人浏览 · 2020-08-09 14:15:11

暖仔会飞 · 2020-08-09 14:15:11 发布

目标

给定站点信息（始发站和终点站）和发车日期（格式为 YYYY-MM-DD），得到相应的车次与票务信息表格。

代码


from selenium import webdriver
import sys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import re
from lxml import etree
import pandas as pd
from prettytable import PrettyTable




def driver_construction():
    """Create and return the Chrome WebDriver used by the scraper.

    The chromedriver executable location is hard-coded to a local Windows
    Anaconda install; adjust it for the machine the script runs on.
    """
    chromedriver_exe = r'C:\Users\dell\Anaconda3\Scripts\chromedriver.exe'
    return webdriver.Chrome(chromedriver_exe)

def parse_argv(argv_lst):
    """Extract the query parameters from a ``sys.argv``-style list.

    Args:
        argv_lst: Sequence whose items 1, 2 and 3 are the departure station,
            the destination station and the departure date written as
            ``YYYY-MM-DD`` (month and day with or without a leading zero).

    Returns:
        ``(departure_station, destination, time)`` where ``time`` is the
        tuple ``(year, month, day)``: the year as its four-digit string, the
        month as its Chinese name (``"一月"`` .. ``"十二月"``) and the day as
        a decimal string without a leading zero.

    Raises:
        IndexError: fewer than three arguments follow the program name.
        ValueError: the date is not ``YYYY-MM-DD`` or its month/day is out
            of range.
    """
    if len(argv_lst) < 4:
        raise IndexError(
            "expected arguments: <departure_station> <destination> <YYYY-MM-DD>"
        )
    departure_station, destination, departure_time = argv_lst[1:4]

    # Anchor the whole string so that junk around the date is rejected
    # instead of silently ending up in the year or day component.
    time_match = re.fullmatch(r'(\d{4})-(\d{1,2})-(\d{1,2})', departure_time)
    if time_match is None:
        raise ValueError(
            "departure time must look like YYYY-MM-DD, got %r" % (departure_time,)
        )

    year = time_match.group(1)
    month_number = int(time_match.group(2))
    day_number = int(time_match.group(3))
    if not 1 <= month_number <= 12:
        raise ValueError("month out of range in %r" % (departure_time,))
    if not 1 <= day_number <= 31:
        raise ValueError("day out of range in %r" % (departure_time,))

    month_names = (
        "一月", "二月", "三月", "四月", "五月", "六月",
        "七月", "八月", "九月", "十月", "十一月", "十二月",
    )
    month = month_names[month_number - 1]
    # Going through int() drops any leading zero ("09" -> "9").
    day = str(day_number)

    time = (year, month, day)
    return departure_station, destination, time




def input_information_request_page(driver, departure_station,destination, time):
    """Fill in the 12306 home-page query form and return the result page HTML.

    Args:
        driver: Selenium WebDriver instance used to drive the browser.
        departure_station: Station name; must equal the ``title`` attribute
            of an ``<li>`` entry in the station pop-up.
        destination: Station name, matched the same way.
        time: ``(year, month, day)`` tuple. ``year`` and ``month`` are
            matched against the text of ``<li>`` entries in the date picker;
            ``day`` is converted with ``int`` and matched against the text of
            a ``<div>``.

    Returns:
        ``driver.page_source`` of the most recently opened window after the
        search button has been clicked.
    """
    driver.get("https://www.12306.cn/index/")

    # Feed the user-supplied values into the form controls on the page.
    year, month, day = time

    driver.find_element(By.ID, "fromStationText").click()
    driver.find_element(By.XPATH, '//li[@title="%s"]' % departure_station).click()

    driver.find_element(By.ID, "toStationText").click()
    driver.find_element(By.XPATH, '//li[@title="%s"]' % destination).click()

    driver.find_element(By.ID, "train_date").click()
    driver.find_element(By.XPATH, '//div[@class="year"]//input[@type="text"]').click()
    driver.find_element(By.XPATH, '//li[text()="%s"]' % year).click()
    driver.find_element(By.XPATH, '//div[@class="month"]//input[@type="text"]').click()
    driver.find_element(By.XPATH, '//li[text()="%s"]' % month).click()
    driver.find_element(By.XPATH, '//div[text()=%d]' % int(day)).click()

    # Click the search button.
    driver.find_element(By.ID, "search_one").click()

    # Point the driver at the most recently opened window.
    driver.switch_to.window(driver.window_handles[-1])

    # NOTE(review): the source is read immediately after the switch, with no
    # explicit wait; confirm the result rows are already rendered by then.
    return driver.page_source


def attain_infos_from_page_source(page_source):
    """Extract one row of timetable/ticket cells per train from the result HTML.

    Args:
        page_source: HTML text of the query result page.

    Returns:
        A dict mapping each train number to a list of 18 cell strings
        (spaces removed, left-justified to width 6). The extra key
        ``"title"`` holds the tuple of 18 column headers, padded the same
        way. Trains whose extracted row does not have exactly one cell per
        header are left out.
    """
    html = etree.HTML(page_source)
    diction = {}
    columns_index = ["车次", "出发站", "到达站", "出发时间", "到达时间", "历时", "到达状态", "商务座", "一等座", "二等座", "高级软卧", "软卧", "动卧", "硬卧",
                     "软座", "硬座", "无座", "其他"]

    checi_lst = html.xpath('//a[@title="点击查看停靠站信息"]/text()')
    print(checi_lst)
    for checi in checi_lst:
        info = list(html.xpath('//tr[contains(@id,"%s")]//text()' % checi))
        if info:
            # Drop the 2nd and 3rd text nodes by position. Removing by value
            # could delete an earlier equal cell and fails on one-item rows.
            del info[1:3]
            info = info[:7]
        info.extend(html.xpath('//td[contains(@hbid,"%s")]//text()' % checi))
        info = [cell.replace(' ', '').ljust(6) for cell in info]
        # Keep only rows that line up with the header columns.
        if len(info) == len(columns_index):
            diction[checi] = info

    diction["title"] = tuple(name.ljust(6) for name in columns_index)
    return diction



def print_form(diction):
    """Build a PrettyTable from the scraped rows.

    Args:
        diction: Mapping whose ``"title"`` entry holds the column headers
            and whose every other value is one table row.

    Returns:
        The populated ``PrettyTable``. ``diction`` is left unmodified, so the
        function can be called again with the same mapping.

    Raises:
        KeyError: ``diction`` has no ``"title"`` entry.
    """
    tab = PrettyTable()                 # formatted output table
    tab.field_names = diction["title"]  # header row
    for key, value in diction.items():
        # Skip the header entry instead of deleting it from the caller's dict.
        if key != "title":
            tab.add_row(value)
    return tab



if __name__ == '__main__':
    # Usage: <script> <departure_station> <destination> <YYYY-MM-DD>
    # Parse the arguments first so bad input fails before a browser starts.
    departure_station, destination, time = parse_argv(sys.argv)
    print(departure_station, destination, time)

    driver = driver_construction()
    try:
        page_source = input_information_request_page(driver, departure_station, destination, time)
    finally:
        # Always shut the browser down, even when a page lookup fails.
        driver.quit()

    diction = attain_infos_from_page_source(page_source)
    tab = print_form(diction)
    print(tab)
    with open('table.txt','w',encoding='utf-8') as f:
        f.write(str(tab))

结果

在这里插入图片描述

魔乐社区

魔乐社区（Modelers.cn) 是一个中立、公益的人工智能社区，提供人工智能工具、模型、数据的托管、展示与应用协同服务，为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作，由全产业链共同建设、共同运营、共同享有，推动国产AI生态繁荣发展。

更多推荐

全家桶集齐！Qwen3.5四款小模型上线魔乐社区，附昇腾全套实践教程

魔乐社区

Pont - 搭建前后端之桥：高效、灵活的接口管理工具

Pont 是一款强大的数据服务层解决方案，它能够帮助开发者快速搭建前后端之间的桥梁，实现接口的高效管理和代码自动生成。无论是新手还是有经验的开发者，都能通过 Pont 轻松处理接口文档、生成类型安全的 API 代码，从而显著提升开发效率。[![Pont 工具标志](https://raw.gitcode.com/gh_mirrors/po/pont/raw/3f1b7d4bbba3fd2dda