[Dev log] Python Crawling
Godwony
2020. 4. 23. 10:27
Let's crawl celebrity names with Python.
I crawled the #연예인명단 (celebrity list) hashtag page on m.search.daum.net.
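Before running the full crawl, it can help to fetch a single page and confirm the markup is still what the script expects. This is a minimal sketch, assuming the hashtag page still renders each entry as a .wrap_cont block containing a .f_link_tit name element; sidx=0 is just an example starting offset.

# Sanity check: fetch one page of the hashtag results and count the cards.
# Uses the same URL and class names as the full script below.
from bs4 import BeautifulSoup as bs
import urllib.request

url = "https://m.search.daum.net/kakao?w=smok&DA=AQJ&q=%EC%97%B0%EC%98%88%EC%9D%B8%EB%AA%85%EB%8B%A8&sidx=0"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req).read()

soup = bs(html, 'html.parser')
cards = soup.select('.wrap_cont')
print(len(cards), 'cards found')
if cards:
    name = cards[0].select_one('.f_link_tit')
    print(name.text if name else 'name element not found')

If this prints a non-zero card count and a plausible name, the selectors in the full script should still work.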
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import urllib.request
import time, random, csv

def celebrities_names(start, end):
    randomsl = random.uniform(1, 3)  # delay between requests, drawn once per run
    hdr = {'User-Agent': 'Mozilla/5.0'}
    url = "https://m.search.daum.net/kakao?w=smok&DA=AQJ&q=%EC%97%B0%EC%98%88%EC%9D%B8%EB%AA%85%EB%8B%A8&sidx="
    namesList = []
    for page in tqdm(range(start, end, 10)):  # sidx advances 10 entries per page
        req = urllib.request.Request(url + str(page), headers=hdr)
        html = urllib.request.urlopen(req).read()
        soup = bs(html, 'html.parser')
        time.sleep(randomsl)
        wrap_cont = soup.select('.wrap_cont')  # one card per celebrity
        for i in wrap_cont:
            temp = []
            try:
                temp.append(i.select_one('.f_link_tit').text)                         # name
                temp.append(i.select_one('.cont.f_eb').text)                          # job
                temp.append('https://m.search.daum.net/kakao' + i.find('a')['href'])  # profile link
                temp.append(i.select_one('.cont.f_eb.ff_hel').text)                   # birth
            except (AttributeError, TypeError):
                temp.append('NaN')  # some entries have no birth date, so this is wrapped in try
            namesList.append(temp)
    with open('celebritiesname' + str(end) + '.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'job', 'link', 'birth'])
        writer.writerows(namesList)

if __name__ == "__main__":
    start = int(input('Start page (sidx, multiple of 10): '))
    end = int(input('How many people to search: '))
    celebrities_names(start, end)
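After a run, the CSV can be read back to confirm the four columns came out as expected. A small sketch, assuming the script above was run with end=100 so the output file is celebritiesname100.csv (adjust the name to your own run):

import csv

# Read back the output written by celebrities_names().
with open('celebritiesname100.csv', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)  # columns: name, job, link, birth
    for row in reader:
        print(row['name'], '|', row['job'], '|', row['birth'])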