Python_2_Examples_and_Notes/spse_excercises/exc_mod4_p7_multithread_web...

#!/usr/bin/python
# -*- coding: utf-8 -*-

# A project that scrapes a website, inserts the scraped links into MySQL and serves the MySQL data through a socket-based HTTP server.
# The spider and the HTTP server each run in their own process (multiprocessing).
#
# If the MySQL table does not exist, this script will create it, but the database and user must be created in advance!

import multiprocessing
import MySQLdb as mdb
from lxml import html
import requests
import time
from time import gmtime, strftime
import SocketServer
import SimpleHTTPServer

# Proxy: the same local SOCKS5 endpoint is used for both URL schemes.
# NOTE(review): port 9050 looks like a local Tor SOCKS port -- if so, confirm
# whether "socks5h://" (proxy-side DNS resolution) is wanted instead of "socks5://".
_SOCKS_PROXY_URL = "socks5://127.0.0.1:9050"
proxies = dict.fromkeys(('http', 'https'), _SOCKS_PROXY_URL)

# Spidering
def spidering_l(proc):
    """Scrape the target front page once a minute and store its links in MySQL.

    Runs forever: each cycle downloads the page through the module-level
    ``proxies``, collects every ``<a>`` element's ``href`` and text, and
    inserts them into the ``news_data`` table (created on demand) together
    with a UTC timestamp identifying the cycle.

    Args:
        proc: Unused; kept so the function can be launched with ``args=(0,)``.

    Errors from the HTTP request or from MySQL are printed and the cycle is
    skipped, so a transient failure does not terminate the spider.
    """
    while True:
        # Get the page. The timeout keeps a stalled proxy from hanging the
        # loop forever; any request failure is reported and retried next cycle.
        try:
            page = requests.get('https://riskdiscovery.com/', proxies=proxies, timeout=60)
        except requests.exceptions.RequestException as e:
            print(e)
            time.sleep(60)
            continue
        tree = html.fromstring(page.content)

        # One timestamp per cycle so all rows of a scrape share the same state_time.
        state_time = strftime("%Y%m%d%H%M%S", gmtime())

        # Collect (state_time, link, text) rows for every anchor that has an href.
        # Anchors without href are skipped: the link column is NOT NULL, so a
        # NULL value would make the insert fail.
        rows = []
        for link in tree.xpath('//a'):
            href = link.get('href')
            if href is None:
                continue
            rows.append((state_time, href, link.text))

        # MySQL
        con = None
        try:
            con = mdb.connect('localhost', 'someuser', 'somepass', 'news_parse')
            cur = con.cursor()
            # Create the table once per cycle instead of once per link.
            cur.execute("CREATE TABLE IF NOT EXISTS news_data ( id INT(6) UNSIGNED AUTO_INCREMENT PRIMARY KEY, state_time VARCHAR(30) NOT NULL, link VARCHAR(512) NOT NULL, text VARCHAR(1024))")
            if rows:
                cur.executemany("INSERT INTO news_data (id, state_time, link, text) VALUES (NULL, %s, %s, %s)", rows)
            # A single commit covers the table creation and all rows of this cycle.
            con.commit()
        except mdb.Error as e:
            print(e)
        finally:
            if con:
                con.close()
        time.sleep(60)


# Serve HTTP
class HttpRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Request handler that lists the stored links on "/" and serves files otherwise."""

    def do_GET(self):
        """Answer GET requests.

        For the path "/" the distinct values of the ``link`` column of
        ``news_data`` are returned as plain text, one per line, after a fixed
        banner line. Any other path is delegated to the stock
        ``SimpleHTTPRequestHandler`` file-serving behaviour.

        MySQL errors are printed and whatever was collected so far (possibly
        nothing) is still sent, so the client always gets a response.
        """
        if self.path != "/":
            SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)
            return

        # Unique links only (that's why it's a set); output order is therefore arbitrary.
        unique_links = set()
        con = None  # stays None if connect() fails, so the finally block is safe
        try:
            con = mdb.connect('localhost', 'someuser', 'somepass', 'news_parse')
            cur = con.cursor()
            # Select the column by name rather than relying on its position in SELECT *.
            # NOTE: the query has no time filter, despite the "72 hours" banner below.
            cur.execute("SELECT link FROM news_data")
            for row in cur.fetchall():
                unique_links.add(row[0])
        except mdb.Error as e:
            print(e)
        finally:
            if con:
                con.close()

        last_list = ''.join(link + '\n' for link in unique_links)

        # A status line and headers must precede the body for a valid HTTP response.
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write('The latest news from the past 72 hours: \n' + str(last_list))

def serve_http(proc):
    """Serve HttpRequestHandler on 127.0.0.1:8080 until the process is stopped.

    Args:
        proc: Unused; kept so the function can be launched with ``args=(0,)``.
    """
    # Must be set before the server is constructed, because the socket is
    # bound inside the TCPServer constructor.
    SocketServer.TCPServer.allow_reuse_address = True
    httpServer = SocketServer.TCPServer(("127.0.0.1", 8080), HttpRequestHandler)
    print("It should listen on 8080.")
    try:
        httpServer.serve_forever()
    finally:
        # Release the listening socket when serve_forever() exits (e.g. on interrupt).
        httpServer.server_close()


if __name__ == '__main__':

    # One worker process per task: the spider first, then the HTTP front end.
    jobs = []
    for worker in (spidering_l, serve_http):
        job = multiprocessing.Process(target=worker, args=(0,))
        jobs.append(job)
        job.start()
Exercises for the SPSE exam. 2017-10-12 19:26:45 +00:00			`#!/usr/bin/python`
			`# -- coding: utf-8 --`

			`# A project for parsing a website, then inserting the data into mysql and serving the mysql data through socket http server.`
			`# The spider and the HTTP server each run in their own process (multiprocessing).`
			`#`
			`# If the mysql table does not exist, this script will create it, but the database and user should be created in advance!`

			`import multiprocessing`
			`import MySQLdb as mdb`
			`from lxml import html`
			`import requests`
			`import time`
			`from time import gmtime, strftime`
			`import SocketServer`
			`import SimpleHTTPServer`

			`# Proxy`
			`proxies = {`
			`'http': "socks5://127.0.0.1:9050",`
			`'https': "socks5://127.0.0.1:9050"}`

			`# Spidering`
			`def spidering_l(proc):`
			`while True:`
			`# Sprudeling`
			`# Get the page`
			`page = requests.get('https://riskdiscovery.com/'), proxies=proxies)`
			`tree = html.fromstring(page.content)`
			`# Vars`
			`state_time = strftime("%Y%m%d%H%M%S", gmtime())`
			`spider_list_link = []`
			`spider_list_text = []`
			`spider_state = str(state_time)`
			`# Loop around links //a`
			`for link in tree.xpath('//a'):`
			`spider_list_link.append(link.get('href'))`
			`spider_list_text.append(link.text)`
			`# MySQL`
			`con = ""`
			`try:`
			`con = mdb.connect('localhost', 'someuser', 'somepass', 'news_parse')`
			`cur = con.cursor()`
			`zp = zip(spider_list_link,spider_list_text)`
			`for zipo in zp:`
			`link_c = zipo[0]`
			`text_c = zipo[1]`
			`cur.execute("CREATE TABLE IF NOT EXISTS news_data ( id INT(6) UNSIGNED AUTO_INCREMENT PRIMARY KEY, state_time VARCHAR(30) NOT NULL, link VARCHAR(512) NOT NULL, text VARCHAR(1024))")`
			`con.commit()`
			`cur.execute("INSERT INTO news_data (id, state_time, link, text) VALUES (NULL, %s, %s, %s)",(state_time,link_c,text_c))`
			`con.commit()`
			`except mdb.Error, e:`
			`print e`
			`finally:`
			`if con:`
			`con.close()`
			`time.sleep(60)`
			`return`


			`# Serve HTTP`
			`class HttpRequestHandler (SimpleHTTPServer.SimpleHTTPRequestHandler) :`
			`def do_GET(self) :`
			`last_list=""`
			`if self.path == "/":`
			`try:`
			`con = mdb.connect('localhost', 'someuser', 'somepass', 'news_parse')`
			`cur = con.cursor()`
			`cur.execute("SELECT * FROM news_data")`
			`data = cur.fetchall()`

			`# Unique identifiers (that's why it's a set)`
			`sset=set()`
			`for x in data:`
			`sset.add(x[2])`
			`for xx in sset:`
			`last_list = last_list + xx + '\n'`

			`# Note, playing with sets`
			`# final_out = []`
			`# for xx in sset:`
			`# curr_st = xx`
			`# for x in data:`
			`# if curr_st == x[1]:`
			`# final_out.(x[2][3])`


			`except mdb.Error, e:`
			`print e`
			`finally:`
			`if con:`
			`con.close()`
			`self.wfile.write('The latest news from the past 72 hours: \n' + str(last_list))`

			`else:`
			`SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)`

			`def serve_http(proc):`
			`SocketServer.TCPServer.allow_reuse_address = True`
			`httpServer = SocketServer.TCPServer(("127.0.0.1", 8080), HttpRequestHandler)`
			`print "It should listen on 8080."`
			`httpServer.serve_forever()`

			`pass`


			`if __name__ == '__main__':`

			`jobs = []`

			`# Start spidering`
			`mp = multiprocessing.Process(target=spidering_l, args=(0,))`
			`jobs.append(mp)`
			`mp.start()`

			`# Serve the results on HTTP`
			`hp = multiprocessing.Process(target=serve_http, args=(0,))`
			`jobs.append(hp)`
			`hp.start()`