Python_2_Examples_and_Notes/33_web_parsing_beautifulsou...

45 lines
964 B
Python
Raw Permalink Normal View History

2017-10-11 20:07:24 +00:00
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
# Demo: fetch a page with requests, then inspect it with BeautifulSoup.
# Every print() call below takes exactly one argument, so it behaves the same
# as the Python 2 print statement and also parses under Python 3.

# GET with requests, then parse with BeautifulSoup.
# A timeout is passed because requests otherwise waits indefinitely on an
# unresponsive server.
r = requests.get("https://gentoo.org/", timeout=30)
print(r.content)
# It's recommended to use lxml, not the default html parser (the lxml package
# must be installed for this parser name to be accepted).
bt = BeautifulSoup(r.content, "lxml")
print(bt.title)
print(bt.title.string)

# Find all meta tags (queried once; the result is reused below).
allMetaTags = bt.find_all('meta')
print(allMetaTags)
# Show a single tag; the index is guarded so a page with fewer meta tags does
# not raise IndexError.
if len(allMetaTags) > 2:
    print(allMetaTags[2])
# Subscripting a tag raises KeyError when the attribute is missing, which is
# presumably why allMetaTags[0]['content'] "works in the video, not here"
# (e.g. a charset-only <meta> tag has no content attribute). Tag.get()
# returns None instead of raising:
#print(allMetaTags[0].get('content'))

# Print all links from the site
allLinks = bt.find_all('a')
print(len(allLinks))  # How many links do we have?
#print(allLinks[1])
# Guarded for the same reason as the meta tag above; .get() avoids a KeyError
# if that particular anchor has no href.
if len(allLinks) > 4:
    print(allLinks[4].get('href'))
#print(allLinks[1].string)

# Print all text output, could be great for password list generations
print(bt.get_text())

# Print all links. Anchors without an href attribute (e.g. named anchors) are
# skipped rather than aborting the loop with a KeyError.
for link in allLinks:
    href = link.get('href')
    if href is not None:
        print(href)
# print(bt.meta.next.next.next.next.next.next)  # Don't.