python处理xml数据_python解析xml文档,数据清洗,并入库操作
# Summary: read GBK-encoded XML files with xml.dom.minidom, strip a few punctuation marks from the content of each <text> element, and write the cleaned text to one .txt file per <instance> element.
# coding=utf-8
import os
import re
import string
__author__ = 'peng'
from xml.dom import minidom
import sys
# Punctuation marks removed from every <text> payload before it is written.
# NOTE(review): only these four characters are stripped; any other
# punctuation (e.g. full-width variants of the ASCII marks) passes through
# unchanged -- confirm this matches the data being cleaned.
_PUNCTUATION_TO_STRIP = (u',', u'。', u'!', u'?')

# Matches the encoding pseudo-attribute of a leading XML declaration,
# whatever its case, quoting style or neighbouring attributes.
_XML_DECL_ENCODING = re.compile(
    r'^(<\?xml[^>]*?encoding\s*=\s*)(["\'])[^"\']*\2')


def _element_text(element):
    """Return the character data held directly inside *element*."""
    return u"".join(
        child.data
        for child in element.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )


def xmlToTxt(xmlString, output_dir="e:\\"):
    """Convert one GBK-encoded XML file into cleaned UTF-8 text files.

    The document is decoded from GBK and parsed with ``minidom``. For every
    ``<instance>`` element a file named after the second ``/``-separated
    component of its ``waveuri`` attribute (plus ``.txt``) is created in
    *output_dir*. Each file receives the concatenated text of all ``<text>``
    elements of the document with the characters in
    ``_PUNCTUATION_TO_STRIP`` removed.

    Args:
        xmlString: Path of the XML file to read (despite the name, this is
            a path, not XML content).
        output_dir: Directory that receives the ``.txt`` files.

    Returns:
        A list with the paths of the files written, in document order.

    Raises:
        IOError / OSError: If the input cannot be read or an output file
            cannot be created.
        UnicodeDecodeError: If the input is not valid GBK.
        xml.parsers.expat.ExpatError: If the input is not well-formed XML.
        IndexError: If a ``waveuri`` attribute contains no ``/``.
    """
    # Local import so the block stays self-contained; io.open gives an
    # explicit output encoding on both Python 2 and Python 3.
    import io

    with open(xmlString, "rb") as source_file:
        raw_bytes = source_file.read()

    # Decode once at the I/O boundary, then rewrite the declaration so that
    # it agrees with the UTF-8 bytes handed to the parser.
    xml_text = raw_bytes.decode("gbk")
    xml_text = _XML_DECL_ENCODING.sub(
        r"\g<1>\g<2>utf-8\g<2>", xml_text, count=1)
    doc = minidom.parseString(xml_text.encode("utf-8"))

    root = doc.documentElement
    texts = root.getElementsByTagName("text")
    instances = root.getElementsByTagName("instance")

    # The cleaned text does not depend on the instance, so build it once.
    cleaned_texts = []
    for text in texts:
        cleaned = _element_text(text)
        for mark in _PUNCTUATION_TO_STRIP:
            cleaned = cleaned.replace(mark, u"")
        cleaned_texts.append(cleaned)

    written_paths = []
    for instance in instances:
        text_name = instance.getAttribute("waveuri").split('/')[1]
        file_text_name = os.path.join(output_dir, text_name + ".txt")
        # The context manager closes the file even if a write fails.
        with io.open(file_text_name, "w", encoding="utf-8") as file_text:
            for cleaned in cleaned_texts:
                file_text.write(cleaned)
        written_paths.append(file_text_name)
    return written_paths
if __name__ == "__main__":
    # Directory to scan. NOTE(review): "文件地址" looks like a placeholder
    # for the real folder name -- confirm before running.
    source = "e:\\" + "文件地址"
    # Walk the directory tree and convert every file found. Joining with
    # the directory yielded by os.walk gives a valid path for files in
    # subdirectories too, and supplies the separator that plain string
    # concatenation of directory and file name leaves out.
    for root, dirs, files in os.walk(source):
        for OneFileName in files:
            print(OneFileName)
            xmlToTxt(os.path.join(root, OneFileName))
魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐

所有评论(0)