-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathXMLParse.py
executable file
·79 lines (57 loc) · 2.12 KB
/
XMLParse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import xml.etree.ElementTree as etree
import os
from pprint import pprint
import pandas as pd
buPath = os.path.dirname(__file__)


class ParseXML():
    """Turn attribute-based XML files into pandas DataFrames (and CSV files).

    Each direct child of the document root becomes one row, and each XML
    attribute on that child becomes a column. Attribute values are kept as
    the strings ElementTree returns. Relative paths are resolved against
    ``buPath``, the directory containing this module.
    """

    def __init__(self, filePath):
        """Parse the XML file at ``filePath`` (relative to ``buPath``).

        The parsed tree is stored on ``self.data``.
        """
        self.data = etree.parse(os.path.join(buPath, filePath))

    @staticmethod
    def _rootToDataFrame(root):
        """Build a DataFrame with one row per direct child of ``root``."""
        return pd.DataFrame([child.attrib for child in root])

    @staticmethod
    def _xmlFileNames(dirPath):
        """Return the sorted ``.xml`` file names found in ``dirPath``.

        Only XML files are returned so that other directory entries (for
        example CSV files written by an earlier ``convertAllToCSV`` run) are
        never handed to the XML parser.
        """
        entries = os.listdir(os.path.join(buPath, dirPath))
        return sorted(name for name in entries if name.lower().endswith(".xml"))

    @staticmethod
    def convertAll(dirPath=""):
        """Convert every ``.xml`` file in ``dirPath`` to a DataFrame.

        Args:
            dirPath: Directory to scan, relative to ``buPath``.

        Returns:
            A list of DataFrames, one per XML file, in file-name order.
        """
        return [
            ParseXML.convertXML(os.path.join(dirPath, fileName))
            for fileName in ParseXML._xmlFileNames(dirPath)
        ]

    @staticmethod
    def convertAllToCSV(dirPath=""):
        """Convert every ``.xml`` file in ``dirPath`` and write each as CSV.

        Args:
            dirPath: Directory to scan, relative to ``buPath``.

        Returns:
            A list of the DataFrames that were written, in file-name order.
        """
        dfs = []
        for fileName in ParseXML._xmlFileNames(dirPath):
            df = ParseXML.convertXML(os.path.join(dirPath, fileName))
            dfs.append(df)
            ParseXML.dfToCSV(df, fileName, dirPath)
        return dfs

    @staticmethod
    def convertXML(filePath="", idField="Id"):
        """Parse one XML file and return its rows as a DataFrame.

        Args:
            filePath: XML file to read, relative to ``buPath``.
            idField: Accepted for backward compatibility and ignored; rows
                that lack this attribute no longer raise ``KeyError``.

        Returns:
            A DataFrame with one row per direct child of the document root.
        """
        pprint("Converting " + filePath + " to a dataFrame.")
        tree = etree.parse(os.path.join(buPath, filePath))
        return ParseXML._rootToDataFrame(tree.getroot())

    @staticmethod
    def dfToCSV(df, fileName, filePath):
        """Write ``df`` as a CSV file named after ``fileName``.

        The file extension of ``fileName`` is replaced with ``.csv`` and the
        result is written into ``filePath``, resolved against ``buPath`` like
        every read in this class.

        Args:
            df: DataFrame to write.
            fileName: Name of the source file, e.g. ``"Posts.xml"``.
            filePath: Target directory, relative to ``buPath``.
        """
        pprint("Converting " + fileName + " into a csv")
        # splitext swaps the extension without assuming its length or any
        # particular prefix on the directory part.
        stem, _ = os.path.splitext(fileName)
        df.to_csv(os.path.join(buPath, filePath, stem + ".csv"))

    # Modified from https://gist.github.com/mattmc3/712f280ec81044ec7bd12a6dda560787
    def convert(self, idField="Id"):
        """Return the tree loaded by ``__init__`` as a DataFrame.

        Args:
            idField: Accepted for backward compatibility and ignored.

        Returns:
            A DataFrame with one row per direct child of the document root.
        """
        return ParseXML._rootToDataFrame(self.data.getroot())

    def processData(self):
        """Take the parsed root XML and return it as a DataFrame."""
        return ParseXML._rootToDataFrame(self.data.getroot())
def main(dirPath="../../data/serverfault.com/"):
    """Convert every XML file in ``dirPath`` to CSV.

    Args:
        dirPath: Directory passed unchanged to ``ParseXML.convertAllToCSV``.
            Defaults to the serverfault.com data directory this script was
            originally hard-coded to use.
    """
    ParseXML.convertAllToCSV(dirPath)
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()