#!/usr/bin/python
# -*- coding: utf-8 -*-

# A small project that scrapes a website, inserts the scraped data into MySQL,
# and serves the MySQL data through a socket-based HTTP server.
# The scraper and the HTTP server each run in their own process (multiprocessing).
#
# If the MySQL table does not exist, this script will create it, but the
# database and the MySQL user must be created in advance!
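#
# For example, a setup along these lines would create the database and user in
# advance (the names 'news_parse', 'someuser' and 'somepass' are simply the
# values hard-coded further down; adjust them to your environment):
#
#   CREATE DATABASE news_parse;
#   CREATE USER 'someuser'@'localhost' IDENTIFIED BY 'somepass';
#   GRANT ALL PRIVILEGES ON news_parse.* TO 'someuser'@'localhost';
#   FLUSH PRIVILEGES;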

import multiprocessing
import MySQLdb as mdb
from lxml import html
import requests
import time
from time import gmtime, strftime
import SocketServer
import SimpleHTTPServer
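
# Under Python 2, the third-party imports above are typically installed with
# something like:
#   pip install requests lxml MySQL-python PySocks
# (PySocks is only needed so that requests can use the socks5:// proxy below;
# SocketServer and SimpleHTTPServer are part of the standard library).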

# Proxy (127.0.0.1:9050 is Tor's default SOCKS port)
proxies = {
    'http': "socks5://127.0.0.1:9050",
    'https': "socks5://127.0.0.1:9050"}


# Spidering
def spidering_l(proc):

    while True:

        # Get the page through the SOCKS proxy and parse it
        page = requests.get('https://riskdiscovery.com/', proxies=proxies)
        tree = html.fromstring(page.content)

        # Vars
        state_time = strftime("%Y%m%d%H%M%S", gmtime())
        spider_list_link = []
        spider_list_text = []
        spider_state = str(state_time)

        # Loop over the links (//a) and collect href + text
        for link in tree.xpath('//a'):
            href = link.get('href')
            if href is None:
                # Skip anchors without an href; the link column is NOT NULL
                continue
            spider_list_link.append(href)
            spider_list_text.append(link.text)

        # MySQL
        con = None
        try:
            con = mdb.connect('localhost', 'someuser', 'somepass', 'news_parse')
            cur = con.cursor()

            cur.execute("CREATE TABLE IF NOT EXISTS news_data ( id INT(6) UNSIGNED AUTO_INCREMENT PRIMARY KEY, state_time VARCHAR(30) NOT NULL, link VARCHAR(512) NOT NULL, text VARCHAR(1024))")
            con.commit()

            for link_c, text_c in zip(spider_list_link, spider_list_text):
                cur.execute("INSERT INTO news_data (id, state_time, link, text) VALUES (NULL, %s, %s, %s)", (state_time, link_c, text_c))
            con.commit()

        except mdb.Error, e:
            print e

        finally:
            if con:
                con.close()

        time.sleep(60)


# Serve HTTP
class HttpRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):

    def do_GET(self):
        last_list = ""
        if self.path == "/":
            con = None
            try:
                con = mdb.connect('localhost', 'someuser', 'somepass', 'news_parse')
                cur = con.cursor()
                cur.execute("SELECT * FROM news_data")
                data = cur.fetchall()

                # Unique links (that's why it's a set); column 2 is the link
                sset = set()
                for x in data:
                    sset.add(x[2])
                for xx in sset:
                    last_list = last_list + xx + '\n'

                # Note, playing with sets
                # final_out = []
                # for xx in sset:
                #     curr_st = xx
                #     for x in data:
                #         if curr_st == x[1]:
                #             final_out.(x[2][3])

            except mdb.Error, e:
                print e
            finally:
                if con:
                    con.close()

            # Write a minimal, valid HTTP response around the plain-text body
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write('The latest news from the past 72 hours: \n' + str(last_list))

        else:
            SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)


def serve_http(proc):
    SocketServer.TCPServer.allow_reuse_address = True
    httpServer = SocketServer.TCPServer(("127.0.0.1", 8080), HttpRequestHandler)
    print "Listening on 127.0.0.1:8080."
    httpServer.serve_forever()


if __name__ == '__main__':

    jobs = []

    # Start spidering
    mp = multiprocessing.Process(target=spidering_l, args=(0,))
    jobs.append(mp)
    mp.start()

    # Serve the results on HTTP
    hp = multiprocessing.Process(target=serve_http, args=(0,))
    jobs.append(hp)
    hp.start()

    # Both processes run until interrupted
    for job in jobs:
        job.join()
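
# A quick way to check the result once both processes are running, using the
# address hard-coded in serve_http above:
#
#   curl http://127.0.0.1:8080/
#
# This should print the de-duplicated list of scraped links; any other path is
# handled by SimpleHTTPServer's default file serving.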