# 인공지능 공부/남박사의 파이썬 실전
#
# (인프런) 파이썬 실전: 네이버 블로그 검색 결과 크롤링하기

from re import S
import requests
from bs4 import BeautifulSoup

def _pages_from_counter(counter_text, per_page=10, max_pages=900):
    """Turn counter text such as "41-50 / 6,670건" into a page count.

    Only the part after the last "/" is used. The count suffix "건" and
    thousands separators are stripped before converting to ``int``.

    Raises:
        ValueError: If the remaining text is not an integer.
    """
    total_text = counter_text.split("/")[-1]
    total = int(total_text.replace("건", "").replace(",", "").strip())
    # Round up so a partially filled last page is still visited.
    pages = (total + per_page - 1) // per_page
    return min(pages, max_pages)


def _extract_blog_items(bs):
    """Return (thumbnail, title, link, summary) tuples found in one page."""
    items = []
    for li in bs.select("li.sh_blog_top"):
        try:
            thumbnail = li.select("img")[0]["src"]
            title = li.select("dl > dt > a")[0]
            summary = li.select("dl > dd.sh_blog_passage")[0].text
            title_link = title["href"]
        except (IndexError, KeyError):
            # Entry lacks one of the expected elements/attributes; skip it.
            continue
        items.append((thumbnail, title.text, title_link, summary))
    return items


def get_search_naver_blog(query, start_page=1, end_page=None):
    """Collect Naver blog search results for ``query`` across result pages.

    Pages are fetched one by one from ``start_page`` through ``end_page``
    (inclusive). The page at ``start_page`` is always fetched, even when
    ``end_page`` is smaller.

    Args:
        query: Search keyword; it is URL-encoded by ``requests``.
        start_page: 1-based number of the first result page to fetch.
        end_page: 1-based number of the last result page to fetch. When
            ``None`` it is derived from the total-count text of the first
            fetched page (``span.title_num``), capped at 900 pages. If that
            text is missing or unparsable, only ``start_page`` is fetched.

    Returns:
        A list of ``(thumbnail_src, title_text, title_link, summary)``
        tuples in page order. Entries missing any of those parts are
        skipped.

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    result = []
    page = start_page

    # Iterate instead of recursing once per page: with up to 900 pages the
    # recursive form can hit Python's default recursion limit.
    while True:
        # Each page holds 10 items: page 2 -> start=11, page 3 -> start=21.
        start = (page - 1) * 10 + 1
        r = requests.get(
            "https://search.naver.com/search.naver",
            params={"where": "view", "query": query, "start": start},
            timeout=10,
        )
        bs = BeautifulSoup(r.text, "lxml")

        if end_page is None:
            counter = bs.select("span.title_num")
            try:
                end_page = _pages_from_counter(counter[0].text)
            except (IndexError, ValueError):
                # No usable total count on the page: stop after this page.
                end_page = page

        result.extend(_extract_blog_items(bs))

        if page >= end_page:
            break
        page += 1

    return result

# Demo run: crawl result pages 1 through 3 for the sample keyword and
# print every collected tuple on its own line.
results = get_search_naver_blog(
    query="파이썬강좌",
    start_page=1,
    end_page=3,
)
for result in results:
    print(result)