Пытаюсь сделать многопоточный скрипт, который забирает информацию из базы PostgreSQL (ссылки на веб-сайты), проходит по ним, собирает информацию и вносит её снова в БД.
Мои наброски:
import socket
import threading
import urllib2

import psycopg2
from bs4 import BeautifulSoup
def scrape(link, id):
# print link, id
# connect to database
connection = psycopg2.connect(database = "contacts", user = "???", password = "???", host="localhost", port="5432")
# create new cursor
curs = connection.cursor()
# headers for opening links
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
# try open link
try:
# connect to the page and get request
req = urllib2.Request(link, headers = hdr)
page = urllib2.urlopen(req, timeout = 30) # todo timeout
html = page.read()
# get soup/html code of the page
soup = BeautifulSoup(html, "html.parser")
# finding title on the page
try:
title = soup.find('title')
title = title.text
print 'TITLE: ' + title
except:
title = ''
print "Can't get title!"
# finding meta keywords on the page
try:
meta_keywords = soup.find('meta', attrs = {"name" : "keywords"})
meta_keywords = meta_keywords['content']
print 'META KEYWORDS: ' + meta_keywords
except:
meta_keywords = ''
print "Can't get meta keywords!"
# finding meta description on the page
try:
meta_description = soup.find('meta', attrs = {"name" : "description"})
meta_description = meta_description['content']
print 'META DESCR:' + meta_description
except:
meta_description = ''
print "Can't get meta description."
# update database with new information
query = "UPDATE app_contacts SET visited = %s, title = %s, meta_keywords = %s, meta_description = %s WHERE id = %s AND url = %s;"
data = ("1", title, meta_keywords, meta_description, id, link[7:])
curs.execute(query, data)
connection.commit()
connection.close()
except:
print "Can't open link!"
if __name__ == '__main__':
    # One short-lived connection just to read the work list; each worker
    # thread opens its own connection inside scrape().
    conn = psycopg2.connect(database = "contacts", user = "???", password = "???", host="localhost", port="5432")
    c = conn.cursor()
    c.execute("SELECT id, url, role from app_contacts WHERE url!='' AND visited='0' order by id;")
    rows = c.fetchall()
    # done reading -- close before spawning workers (original leaked it)
    conn.close()
    # NOTE(review): this starts one thread per row; for a large table cap the
    # concurrency with a pool of workers instead.
    threads = []
    for row in rows:
        # row = (id, url, role); role is selected but not used here
        t = threading.Thread(target = scrape,
                             kwargs = {"link": "http://" + row[1], "id": row[0]})
        t.start()
        threads.append(t)
    # wait for every worker so the script only exits once all rows are
    # processed and committed (original never joined its threads)
    for t in threads:
        t.join()
Собственно, проблема в записи: после запуска скрипт отрабатывает очень странно и отказывается записывать всю собранную информацию в БД. Что я делаю не так?