2016년 12월 1일 목요일

[Python] 웹사이트 읽기

hello python!!



#1. Default
from urllib.request import urlopen
# Fetch the page and print the raw response body (bytes) to stdout.
# The context manager closes the HTTP connection as soon as the body is read,
# instead of leaving the socket open until garbage collection.
with urlopen("http://naver.com") as html:
    print(html.read())



#2. Use BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the page and parse it with the stdlib-backed "html.parser".
# The response is read inside a context manager so the connection is closed
# before parsing continues.
with urlopen("http://naver.com") as html:
    bsObj = BeautifulSoup(html.read(), "html.parser")

# Print the first <h1> element of the document (None if there is none).
print(bsObj.h1)



#3. Set user_agent & 404회피
from urllib.request import Request, urlopen, HTTPError
from bs4 import BeautifulSoup
## Work around HTTP errors (e.g. 404): fall back to the error page body
url = "https://ko.wikipedia.org"
user_agent = "Mozilla/5.0..."

# Send a custom User-Agent header with the request.
request = Request(url)
request.add_header('User-Agent', user_agent)

try:
    # Only the network call can raise HTTPError, so keep the try body minimal.
    # The context manager closes the connection once the body has been read.
    with urlopen(request) as response:
        contents = response.read()
except HTTPError as e:
    # HTTPError is itself a file-like response object; read the error page
    # it carries so the parsing step below still has something to work on.
    contents = e.read()

## Extract href values
bsObj = BeautifulSoup(contents, "html.parser")
# find_all is the current name of the deprecated findAll alias.
for link in bsObj.find_all("a"):
    # Anchors without an href attribute are skipped.
    if 'href' in link.attrs:
        print(link.attrs['href'])

댓글 없음:

댓글 쓰기