Добрый день, точнее, доброе утро!
Постепенно осваиваю 3-й Питон, пытаюсь делать на нём что-то полезное. Но часто ловлю себя на мысли, что полностью «мыслить по-новому» на этом ЯП пока не получается. Да и код ещё далёк от идеала. Поэтому прошу хабражителей помочь и указать на проблемные места.
Идея скрипта: простой парсер RSS с возможностью как выводить записи строчками, так и скачивать более-менее очищенные статьи по ссылкам в отдельные файлы.
1 #!/usr/bin/python3
2
3 ###
4 ### TODO: clean_text: NOT WORKING WELL
5 ### TODO: \x7f problem needs tests
6 ###
7
8 import codecs
9 import sys
10 import os
11 import feedparser
12 import urllib.request, urllib.error, urllib.parse
13 import re
14 import types
15 from bs4 import BeautifulSoup
16 from html.parser import HTMLParser
17
18 stop_string_list=("li><a","<div","href","<par","</div")
19
def clean_text(html_text):
    """Convert an HTML fragment to plain text.

    Comments and tags are removed, opening tags whose name starts with
    ``p``/``P`` become paragraph breaks, character references are decoded
    and runs of blank lines are collapsed.

    Args:
        html_text: HTML markup as a ``str``.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # Imported here because the file only does ``from html.parser import
    # HTMLParser``, which does not bind the name ``html``.
    from html.entities import name2codepoint

    def char_from_entity(match):
        # Unknown entity names decode to U+FFFD (replacement character).
        return chr(name2codepoint.get(match.group(1), 0xFFFD))

    def char_from_number(match):
        hex_digits, dec_digits = match.groups()
        try:
            if hex_digits is not None:
                return chr(int(hex_digits, 16))
            return chr(int(dec_digits))
        except (ValueError, OverflowError):
            # Code point outside the Unicode range.
            return "\uFFFD"

    def clean_str(tmp_str):
        # Each DEL (\x7f) removes itself and the character before it.
        # A DEL with nothing before it is simply dropped, so a leading DEL
        # can no longer stop later ones from being processed.
        kept = []
        for char in tmp_str:
            if char != "\x7f":
                kept.append(char)
            elif kept:
                kept.pop()
        return "".join(kept)

    text = re.sub(r"<!--(?:.|\n)*?-->", "", html_text)
    # The (?!</) lookahead sits directly before ">" and therefore never
    # rejects a match; the pattern is kept as it was.
    text = re.sub(r"<[Pp][^>]*?(?!</)>", "\n\n", text)
    text = re.sub(r"<[^>]*?>", "", text)
    # Numeric references must start with "&#"; without that anchor any
    # "digits;" sequence in ordinary text was turned into a character.
    text = re.sub(r"&#(?:[xX]([0-9A-Fa-f]+)|(\d+));", char_from_number, text)
    text = re.sub(r"&([A-Za-z]+);", char_from_entity, text)
    text = re.sub(r"\n(?:[ \xA0\t]+\n)+", "\n", text)
    text = re.sub(r"\n\n+", "\n\n", text.strip())
    return clean_str(text)
39
def visible(element):
    """Tell whether a parsed text node should be kept as page text.

    Args:
        element: a text node exposing ``parent.name``; its ``str()`` form
            is checked for HTML comment markup.

    Returns:
        ``False`` when the parent tag is one of the non-content tags or
        the text starts with an HTML comment, ``True`` otherwise.
    """
    hidden_parents = ('style', 'script', '[document]', 'head', 'title', 'dates')
    if element.parent.name in hidden_parents:
        return False
    # DOTALL lets "." cross newlines so multi-line comments are caught too.
    # NOTE(review): bs4 comment nodes may stringify without the <!-- -->
    # markers; an isinstance(element, bs4.Comment) check is probably
    # needed as well -- confirm against the installed bs4 version.
    return re.match(r'<!--.*-->', str(element), re.DOTALL) is None
46
def _print_usage():
    """Print the command-line help."""
    usage_lines = (
        "Simple RSS Parser",
        "",
        "Usage:",
        "",
        " rss_parser (rss_link)(mode) [encoding]",
        "",
        "Possible modes:",
        "",
        " line - line by line output(format: YYYY-MM-DD|header|body|link)",
        " pandoc - document compatible with Pandoc",
        " plain - plain text output",
        " dump - dump HTML content using the specified encoding",
        " fplain - dump date, header, body and link to separate files using the specified encoding",
        " fdump - dump HTML content to separate files using the specified encoding",
        "",
    )
    for line in usage_lines:
        print(line)


def _entry_date(entry):
    """Return the entry's update date as zero-padded YYYY-MM-DD."""
    stamp = entry.updated_parsed
    # Zero-padded so the output matches the YYYY-MM-DD format promised by
    # the usage text and so that file names sort chronologically.
    return "{:04d}-{:02d}-{:02d}".format(
        stamp.tm_year, stamp.tm_mon, stamp.tm_mday)


def _fetch_page(url, page_enc):
    """Download ``url`` and decode the response body with ``page_enc``."""
    with urllib.request.urlopen(url) as response:
        return response.read().decode(page_enc)


def _entry_path(wrk_dir, time_stamp, title):
    """Build the output path: date + first 8 non-space title characters."""
    stem = title.replace(" ", "")
    # A path separator inside the title would otherwise be interpreted as
    # a directory and make the file impossible to create.
    for separator in (os.sep, os.altsep):
        if separator:
            stem = stem.replace(separator, "_")
    return os.path.join(wrk_dir, time_stamp + stem[0:8])


def _page_text(page):
    """Extract the visible text of an HTML page, minus markup leftovers."""
    # The parser is named explicitly so the result does not depend on
    # which optional parsers happen to be installed.
    soup = BeautifulSoup(page, "html.parser")
    kept = []
    for node in filter(visible, soup.find_all(text=True)):
        chunk = str(node)
        if not any(stop_item in chunk for stop_item in stop_string_list):
            kept.append(chunk)
    # str.replace returns a new string; the original discarded the result.
    text = "".join(kept).replace("\t", "")
    return re.sub(r"\n\s*\n*", "\n", text)


def _mode_line(entries, page_enc):
    """Print one ``date | title | summary | link`` line per entry."""
    for entry in entries:
        print(_entry_date(entry) + " | " + entry.title + " | "
              + clean_text(entry.summary) + " | " + entry.link)


def _mode_pandoc(entries, page_enc):
    """Print every entry as a Pandoc-compatible section."""
    for entry in entries:
        print("#", _entry_date(entry), " ", entry.title, "#")
        print("")
        print(clean_text(entry.summary))
        print("")
        print(entry.link)
        print("")
        print("")


def _mode_plain(entries, page_enc):
    """Print every entry as plain text."""
    for entry in entries:
        print(_entry_date(entry), " ", entry.title)
        print(clean_text(entry.summary))
        print(entry.link)
        print("")
        print("")


def _mode_dump(entries, page_enc):
    """Print every entry followed by the cleaned text of the linked page."""
    for entry in entries:
        print(_entry_date(entry), " ", entry.title)
        print(clean_text(entry.summary))
        print(entry.link)
        print(clean_text(_fetch_page(entry.link, page_enc)))
        print("")
        print("")


def _mode_fplain(entries, page_enc):
    """Write date, title, summary and link of each entry to its own file."""
    wrk_dir = os.getcwd()
    for entry in entries:
        time_stamp = _entry_date(entry)
        path = _entry_path(wrk_dir, time_stamp, entry.title)
        # Existing files are left untouched (no handle is opened to check).
        if os.path.exists(path):
            continue
        with codecs.open(path, "w", "utf-8") as f_hndl:
            f_hndl.write(time_stamp + " " + entry.title + "\n\n")
            f_hndl.write(clean_text(entry.summary) + "\n\n")
            f_hndl.write(entry.link + "\n")


def _mode_fdump(entries, page_enc):
    """Write each entry plus the text of its linked page to its own file."""
    wrk_dir = os.getcwd()
    for entry in entries:
        time_stamp = _entry_date(entry)
        path = _entry_path(wrk_dir, time_stamp, entry.title)
        if os.path.exists(path):
            continue
        # Fetch before creating the file: a failed download must not leave
        # a partial file behind that would be skipped on the next run.
        body = _page_text(_fetch_page(entry.link, page_enc))
        with codecs.open(path, "w", "utf-8") as f_hndl:
            f_hndl.write(time_stamp + " " + entry.title + "\n\n")
            f_hndl.write(clean_text(entry.summary) + "\n\n")
            f_hndl.write(entry.link + "\n\n")
            f_hndl.write(body)


# Output mode name -> handler taking (entries, page_enc).
_MODES = {
    "line": _mode_line,
    "pandoc": _mode_pandoc,
    "plain": _mode_plain,
    "dump": _mode_dump,
    "fplain": _mode_fplain,
    "fdump": _mode_fdump,
}

# Modes that download the linked pages and need the page encoding.
_MODES_NEEDING_ENCODING = ("dump", "fdump")


def main(argv=None):
    """Run the RSS parser command line.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: with an ``ERROR: ...`` message when an argument is
            missing or unknown, or when the feed cannot be parsed.
    """
    if argv is None:
        argv = sys.argv

    if not argv[1:]:
        _print_usage()
        return

    try:
        rss_link, out_format = argv[1], argv[2]
    except IndexError:
        sys.exit("ERROR: Some argument is missing.")

    # Validate the arguments before touching the network.
    handler = _MODES.get(out_format)
    if handler is None:
        sys.exit("ERROR: Argument not defined.")

    page_enc = None
    if out_format in _MODES_NEEDING_ENCODING:
        try:
            page_enc = argv[3]
        except IndexError:
            sys.exit("ERROR: Encoding not specified.")

    try:
        feed = feedparser.parse(rss_link)
    except Exception:
        sys.exit("ERROR: Unable to parse RSS file. Check the RSS link and internet connection.")

    handler(feed["entries"], page_enc)


if __name__ == "__main__":
    main()
199
То же самое на пэйстбине:
pastebin.com/fjedL8pe