# 1. Default: fetch a page and dump the raw response body (bytes).
from urllib.request import urlopen

# The context manager closes the HTTP response once the body has been read,
# instead of leaving the connection open until garbage collection.
with urlopen("http://naver.com") as html:
    print(html.read())
# 2. Use BeautifulSoup: parse the fetched page and print its <h1> element.
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Close the HTTP response as soon as the document has been parsed.
with urlopen("http://naver.com") as html:
    bsObj = BeautifulSoup(html.read(), "html.parser")

# bsObj.h1 is the first <h1> tag of the document (None when the page has none).
# NOTE(review): the original snippet prints it twice; both calls are kept so
# the output is unchanged -- confirm whether the second one is intentional.
print(bsObj.h1)
print(bsObj.h1)
# 3. Set a User-Agent header and keep going on HTTP errors such as 404.
# HTTPError is imported from urllib.error, the module that defines it.
from urllib.error import HTTPError
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = "https://ko.wikipedia.org"
user_agent = "Mozilla/5.0..."
request = Request(url, headers={'User-Agent': user_agent})

## Survive HTTP errors (e.g. 404): fall back to the body of the error response.
contents = None
try:
    # Only the network call sits inside the try; the response is closed as
    # soon as its body has been read.
    with urlopen(request) as response:
        contents = response.read()
except HTTPError as e:
    # An HTTPError still carries the server's response body; parse that instead.
    contents = e.fp.read()

## Print only the href of every link.
bsObj = BeautifulSoup(contents, "html.parser")
# href=True restricts the search to <a> tags that actually have an href.
for link in bsObj.find_all("a", href=True):
    print(link["href"])
댓글 없음:
댓글 쓰기