-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patholdfeedparser.py
110 lines (95 loc) · 3.36 KB
/
oldfeedparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import datetime
import email.utils
import time
import xml.etree.ElementTree
def parse_xml(feed_uri):
    """Download a feed and parse it into a list of channel dictionaries.

    RSS documents (root ``rss``) yield one dictionary per ``channel`` child,
    built by ``parse_channel``. Atom documents (root ``feed``, with or without
    the Atom namespace) yield a single dictionary built by ``parse_root_atom``.

    Args:
        feed_uri: Location of the feed, passed unchanged to ``download_file``.

    Returns:
        A list of feed dictionaries; empty when an RSS document has no
        ``channel`` children.

    Raises:
        NameError: If the root element is neither ``rss`` nor ``feed``. The
            exception type is kept as-is so existing callers keep working.
    """
    # ElementTree reports namespaced tags as "{uri}local"; real Atom feeds
    # declare the Atom namespace, so both spellings must be accepted.
    atom_root_tags = ("feed", "{http://www.w3.org/2005/Atom}feed")

    data = download_file(feed_uri)
    # NOTE(review): xml.etree.ElementTree is not hardened against maliciously
    # constructed XML; confirm whether feeds here can come from untrusted hosts.
    root_element = xml.etree.ElementTree.fromstring(data)
    root_tag = root_element.tag

    if root_tag == "rss":
        contents = []
        for channel in root_element:
            if channel.tag != "channel":  # another node?? not valid, let's skip it
                print("Root element contains a node of type %s" % channel.tag)
            else:
                contents.append(parse_channel(channel))
        return contents

    if root_tag in atom_root_tags:
        return [parse_root_atom(root_element)]

    raise NameError("The root element is not rss, it may be a html file")
def _format_rss_date(text):
    """Reformat an RFC 822 style date string as ``YYYY-MM-DD HH:MM:SS``.

    Returns None when *text* is empty or cannot be parsed. The timezone
    offset in the input is not applied: ``email.utils.parsedate`` discards it,
    so the result is the wall-clock time exactly as written in the feed.
    """
    if not text:
        return None
    parsed = email.utils.parsedate(text)
    if parsed is None:
        return None
    try:
        return time.strftime("%Y-%m-%d %H:%M:%S", parsed)
    except (TypeError, ValueError, OverflowError):
        # parsedate does not range-check its fields; strftime does.
        return None


def parse_item(item):
    """Extract the fields of one RSS ``item`` element.

    Args:
        item: An ElementTree element whose children are the item's fields.

    Returns:
        A dict with keys ``title``, ``link``, ``description``, ``pubDate`` and
        ``guid``. A field is None when its element is absent; ``pubDate`` is
        also None when the date text is empty or unparseable, so one malformed
        date no longer aborts the whole feed.
    """
    title = None
    link = None
    description = None
    pubDate = None
    guid = None

    for node in item:
        if node.tag == "title":
            title = node.text
        elif node.tag == "link":
            link = node.text
        elif node.tag == "description":
            description = node.text
        elif node.tag == "pubDate":
            pubDate = _format_rss_date(node.text)
        elif node.tag == "guid":
            guid = node.text
        else:
            print("%s element not handled" % node.tag)

    return dict(title=title, link=link, description=description, pubDate=pubDate, guid=guid)
def _format_atom_date(text):
    """Reformat an Atom timestamp as ``YYYY-MM-DD HH:MM:SS``.

    Atom dates are RFC 3339 (for example ``2003-12-13T18:30:02Z``), which
    ``email.utils.parsedate`` cannot read, so ISO parsing is tried first and
    the RFC 822 parser is kept only as a fallback for inputs that used to
    work. Returns None when *text* is empty or matches neither format. The
    timezone offset is not applied; the wall-clock time is kept as written.
    """
    if not text:
        return None
    text = text.strip()

    # Older fromisoformat implementations reject the "Z" suffix.
    iso_text = text[:-1] + "+00:00" if text[-1:] in ("Z", "z") else text
    try:
        return datetime.datetime.fromisoformat(iso_text).strftime("%Y-%m-%d %H:%M:%S")
    except ValueError:
        pass

    parsed = email.utils.parsedate(text)
    if parsed is None:
        return None
    try:
        return time.strftime("%Y-%m-%d %H:%M:%S", parsed)
    except (TypeError, ValueError, OverflowError):
        return None


def parse_atom_item(item):
    """Extract the fields of one Atom entry element.

    Child tags are matched both bare and in the Atom namespace, because
    ElementTree reports namespaced tags as ``{uri}local``. Elements from other
    namespaces are deliberately not matched, so an extension element that
    happens to share a local name cannot overwrite a core field.

    Args:
        item: An ElementTree element whose children are the entry's fields.

    Returns:
        A dict with keys ``title``, ``link`` (the ``href`` attribute of the
        last ``link`` child), ``description`` (text of ``content``),
        ``pubDate`` (from ``published``) and ``guid`` (from ``id``). Missing
        or unparseable values are None.
    """
    atom_ns = "{http://www.w3.org/2005/Atom}"

    title = None
    link = None
    description = None
    pubDate = None
    guid = None

    for node in item:
        tag = node.tag
        if isinstance(tag, str) and tag.startswith(atom_ns):
            tag = tag[len(atom_ns):]

        if tag == "title":
            title = node.text
        elif tag == "link":
            link = node.get("href")
        elif tag == "content":
            description = node.text
        elif tag == "published":
            pubDate = _format_atom_date(node.text)
        elif tag == "id":
            guid = node.text
        else:
            print("%s element not handled" % node.tag)

    return dict(title=title, link=link, description=description, pubDate=pubDate, guid=guid)
def parse_channel(channel):
    """Collect the metadata and articles of one RSS ``channel`` element.

    Args:
        channel: An ElementTree element whose children are the channel's
            fields and ``item`` elements.

    Returns:
        A dict with keys ``articles`` (one ``parse_item`` result per ``item``
        child, in document order), ``title``, ``language``, ``description``
        and ``link``. A text field is None when its element is absent; when an
        element repeats, the last occurrence wins.
    """
    text_fields = ("title", "link", "language", "description")
    meta = dict.fromkeys(text_fields)
    entries = []

    for child in channel:
        name = child.tag
        if name in text_fields:
            meta[name] = child.text
        elif name == "item":
            entries.append(parse_item(child))
        else:
            # Unknown children are reported and otherwise ignored.
            print("%s element not handled" % name)

    return dict(
        articles=entries,
        title=meta["title"],
        language=meta["language"],
        description=meta["description"],
        link=meta["link"],
    )
def parse_root_atom(channel):
    """Collect the metadata and articles of an Atom ``feed`` root element.

    Child tags are matched both bare and in the Atom namespace, because
    ElementTree reports namespaced tags as ``{uri}local``.

    Args:
        channel: The ElementTree root element of the Atom document.

    Returns:
        A dict with keys ``articles``, ``title``, ``language``,
        ``description`` and ``link``. ``articles`` holds one
        ``parse_atom_item`` result per ``entry`` child (``item`` children are
        still routed to ``parse_item`` for backward compatibility).
        ``description`` comes from ``subtitle`` or ``description``; ``link``
        is the ``href`` of the last ``link`` child; ``language`` is the root's
        ``xml:lang`` attribute. Missing values are None.
    """
    atom_ns = "{http://www.w3.org/2005/Atom}"

    title = None
    description = None
    link = None
    articles = []

    # A parsed document exposes xml:lang under the reserved XML namespace, so
    # the literal "xml:lang" key only matches hand-built elements.
    language = channel.get("{http://www.w3.org/XML/1998/namespace}lang")
    if language is None:
        language = channel.get("xml:lang")

    for node in channel:
        tag = node.tag
        if isinstance(tag, str) and tag.startswith(atom_ns):
            tag = tag[len(atom_ns):]

        if tag == "title":
            title = node.text
        elif tag == "link":
            link = node.get("href")
        elif tag in ("subtitle", "description"):
            description = node.text
        elif tag == "entry":
            articles.append(parse_atom_item(node))
        elif tag == "item":
            articles.append(parse_item(node))
        else:
            print("%s element not handled" % node.tag)

    return dict(articles=articles, title=title, language=language, description=description, link=link)