3.2 Web Crawler in Practice
To find the links on a page, parse the HTML with BeautifulSoup, select every <a> tag, and collect each tag's href attribute:

import requests
from bs4 import BeautifulSoup

links = BeautifulSoup(page, 'html.parser').select('a')  # all <a> tags on the page
urls = []
for link in links:
    urls.append(link['href'])

Wrapping this up as a function gives a reusable link extractor:

def extract_links(page):
    links = BeautifulSoup(page, 'html.parser').select('a')
    urls = []
    for link in links:
        urls.append(link['href'])
    return urls
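As a quick sanity check, extract_links can be tried on a small HTML string (the snippet below is a made-up example, not from the original text):

html = '<a href="https://example.com/a">A</a><a href="https://example.com/b">B</a>'
print(extract_links(html))
# ['https://example.com/a', 'https://example.com/b']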
With extract_links in place, a first attempt at a crawler simply fetches a page and recursively follows every link it finds:

def get_page(url):
    page = requests.get(url)
    urls = extract_links(page.text)
    for sub_url in urls:
        get_page(sub_url)
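One caveat the code above glosses over: href values are frequently relative paths such as /about, which requests.get cannot fetch on their own, and an <a> tag without an href raises KeyError. A common fix, sketched here as my own variant rather than code from the original, is to resolve each link against the page's URL with urllib.parse.urljoin and to use link.get('href'):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def extract_links(page, base_url):
    urls = []
    for link in BeautifulSoup(page, 'html.parser').select('a'):
        href = link.get('href')                    # None when the <a> tag has no href
        if href:
            urls.append(urljoin(base_url, href))   # resolve relative paths against the page URL
    return urls

The caller would then pass the page's own URL, e.g. extract_links(page.text, url).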
On a real site this version never terminates: pages link back to one another, so the same URLs get fetched over and over. Recording every visited URL in a crawled set breaks the cycle:

crawled = set()

def get_page(url):
    if url in crawled:
        return  # when the same page is seen again, it is not re-crawled
    crawled.add(url)
    page = requests.get(url)
    urls = extract_links(page.text)
    for sub_url in urls:
        get_page(sub_url)
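Deep recursion is still fragile here: CPython's default recursion limit is about 1000 frames, and a single failed request aborts the whole crawl. Below is a minimal iterative sketch using collections.deque as a work queue; the function name crawl, the timeout value, and the error handling are my assumptions, not part of the original:

import requests
from collections import deque

def crawl(seed_url):
    crawled = set()
    queue = deque([seed_url])            # FIFO queue -> breadth-first traversal
    while queue:
        url = queue.popleft()
        if url in crawled:
            continue                     # already visited, skip
        crawled.add(url)
        try:
            page = requests.get(url, timeout=10)
        except requests.RequestException:
            continue                     # skip pages that fail to load
        for sub_url in extract_links(page.text):
            queue.append(sub_url)
    return crawled

Because the queue replaces the call stack, the crawl's depth is bounded only by memory, and a bad link costs one iteration instead of the entire run.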