Есть парсер:
# -*- coding: utf-8 -*-
import requests
import lxml.html
class Rutor:
    """Search rutor.org for a movie and collect its torrent and magnet links.

    Typical use: build an instance with the search criteria, call
    ``parse_it()``, then read ``result`` -- a dict keyed by a running integer
    index whose values are dicts with the keys ``'title'``,
    ``'torrent_file'`` and ``'magnet'``.
    """

    def __init__(self, title, year='', qu=''):
        """Store the search criteria and initialise empty results.

        Args:
            title: Movie title to search for.
            year: Optional release year appended to the query.
            qu: Optional quality tag (e.g. ``'1080p'``) appended to the query.
        """
        self.title = title
        self.year = year
        self.qu = qu
        self.main_domain = 'http://www.rutor.org/'
        self.search_params = '/search/0/1/100/0/'  # only New movies
        self.search_text = ""
        self.count = 0
        self.result = {}

    def construct_search_text(self):
        """Join the non-empty criteria with spaces, cache and return the text."""
        parts = [part for part in (self.title, self.year, self.qu) if part]
        self.search_text = " ".join(parts)
        return self.search_text

    def construct_search_url(self):
        """Build, print and return the full search URL.

        The domain ends with '/' and the search path starts with '/', so the
        pieces are stripped before joining to avoid a '//' in the URL path.
        """
        base = self.main_domain.rstrip('/')
        path = self.search_params.strip('/')
        search_link = "/".join((base, path, self.construct_search_text()))
        print(search_link)
        return search_link

    def get_page_sourse(self):
        """Fetch the search page and return its body as raw bytes.

        The undecoded body is returned directly instead of decoding it and
        re-encoding with ``r.encoding``: the bytes are the same, but this also
        works when the server declares no charset (``r.encoding`` is None).
        """
        r = requests.get(self.construct_search_url())
        # format() instead of '+' so a None encoding does not raise TypeError.
        print("encoding is: {0}".format(r.encoding))
        return r.content

    def parse_it(self):
        """Download the search page and fill ``self.result`` with the matches.

        Every link under ``div#index`` whose href starts with '/torrent' is a
        candidate; links whose text mentions a trailer are skipped. The two
        elements immediately preceding a candidate are read as the magnet
        link and the .torrent file link respectively.
        """
        all_torrent_links_xpath = "//div[@id='index']//a[starts-with(@href, '/torrent')]"
        # Fetch once and reuse: the debug print below must not trigger a
        # second HTTP request.
        page_source = self.get_page_sourse()
        print(page_source)  # debug output of the downloaded page
        page = lxml.html.document_fromstring(page_source)
        for link in page.xpath(all_torrent_links_xpath):
            print(link)
            # text_content() also covers anchors whose text sits in a child
            # element, where link.text would be None.
            title = link.text_content()
            if u'трейлер' in title.lower():  # we don't need trailers
                continue
            magnet_link = link.getprevious()
            file_link = magnet_link.getprevious() if magnet_link is not None else None
            if file_link is None:
                # Row does not have the expected "file, magnet, title" layout.
                continue
            torrent_file = file_link.get('href')
            magnet = magnet_link.get('href')
            if torrent_file is None or magnet is None:
                continue
            # title[:] is kept from the original: per the author's note it
            # turns lxml's 'lxml.etree._ElementUnicodeResult' into a plain
            # string.
            self.result[self.count] = {
                'title': title[:],
                'torrent_file': torrent_file,
                'magnet': magnet,
            }
            self.count += 1
if __name__ == '__main__':
    # Manual run: search for one hard-coded movie and print what was found.
    rutor = Rutor('Avengers: Age of Ultron', '2015', '1080p')
    rutor.parse_it()
    print(rutor.result)
Если я его запускаю у себя на машине, то получаю красивый html и заполненный m.result
Однако, если я запускаю его через google app engine (flask):
import Rutor
...
@app.route('/test')
def test():
    """Handle /test: run a hard-coded Rutor search and pretty-print the result.

    Always responds with the literal string 'test'; the search results are
    only emitted through pprint.
    """
    scraper = Rutor('Avengers: Age of Ultron', '2015', '1080p')
    scraper.parse_it()
    pprint(scraper.result)
    return 'test'
То вместо исходного кода страницы я получаю кракозябры:
и пустой m.result
Как это пофиксить?