TED에 등록된 최근 톡 200개를 가져와, 그 스크립트에서 가장 많이 사용된 단어들을 추출하여 구글 스프레드시트에 입력하는 예제입니다.
# -*- coding: utf-8 -*-
"""Fetch the 200 most recent TED talks, tally word frequencies across their
English transcripts, and upload the 100 most common words to a Google sheet.
"""
import json
import re
import gspread
import requests
import nltk
import sys
import codecs
import string
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from oauth2client.client import SignedJwtAssertionCredentials

# Run these once to download the tokenizer / stopword corpora:
# nltk.download('punkt')
# nltk.download('stopwords')

# Service-account key downloaded from the Google developer console.
# NOTE(review): SignedJwtAssertionCredentials was removed in oauth2client 2.0;
# on current installs switch to
# oauth2client.service_account.ServiceAccountCredentials.from_json_keyfile_name.
json_key = json.load(open('mykey.json'))
scope = ['https://spreadsheets.google.com/feeds']
credentials = SignedJwtAssertionCredentials(
    json_key['client_email'], json_key['private_key'].encode(), scope)

TED_TALK_URL = 'http://www.ted.com/talks'
TED_URL = 'http://www.ted.com/'
SPREAD_SHEET_NAME = "TEST"

# Total number of TED transcripts to collect.
ALLCOUNT = 200
# Maximum number of TED listing pages to walk (36 talks per page).
MAX_PAGE = 100
# Seconds to pause between transcript fetches; hitting the site faster gets
# the crawler treated as a bot and blocked.
TIME_SLEEP = 4
# HTTP timeout so a stalled request cannot hang the crawl forever.
REQUEST_TIMEOUT = 30

punctuations = list(string.punctuation)
default_stopwords = set(nltk.corpus.stopwords.words('english'))


def openExcel():
    """Authorize with Google Sheets and return the first worksheet."""
    gc = gspread.authorize(credentials)
    return gc.open(SPREAD_SHEET_NAME).sheet1


def get_soup(url):
    """Download *url* and return the parsed BeautifulSoup document.

    Raises requests.HTTPError on a non-2xx response so we never silently
    tokenize an error page.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _clean_words(tokens):
    """Reduce raw tokens to meaningful lowercase words.

    Drops single-character tokens (mostly punctuation), numbers, English
    stopwords, and remaining punctuation — a single pass instead of four
    full-list rebuilds.
    """
    words = []
    for token in tokens:
        if len(token) <= 1 or token.isnumeric():
            continue
        # Lowercase before the stopword test: default_stopwords is lowercase.
        word = token.lower()
        if word in default_stopwords or word in punctuations:
            continue
        words.append(word)
    return words


def script_reader(url):
    """Fetch one talk's English transcript and return its cleaned word list."""
    curl = TED_URL + url + '/transcript?language=en'
    soup = get_soup(curl)
    tokens = []
    for spans in soup.find_all('span', class_="talk-transcript__fragment"):
        tokens += word_tokenize(spans.text)
    words = _clean_words(tokens)
    print(" + " + str(len(words)))
    return words


def writeListToExcel(words):
    """Write (word, frequency) pairs into columns A/B of the spreadsheet.

    *words* is a list of (word, frequency) tuples, one sheet row per tuple.
    """
    wks = openExcel()
    wks.add_rows(len(words))
    # range() yields cells in row-major order: A1, B1, A2, B2, ... so the
    # running index alternates between the word and frequency columns.
    cell_list = wks.range('A1:B%d' % len(words))
    idx = 0
    for word, frequency in words:
        cell_list[idx].value = word
        idx += 1
        cell_list[idx].value = frequency
        idx += 1
    wks.update_cells(cell_list)
    print("done")


def get_latest_ted(url):
    """Crawl the latest talks, aggregate transcript words, upload the top 100.

    Stops as soon as ALLCOUNT transcripts have been analyzed; if the listing
    runs out first, uploads whatever was collected.
    """
    wholewords = []
    talk = 0
    # 36 scripts / page. range end is exclusive, so MAX_PAGE + 1 actually
    # visits all MAX_PAGE pages (the original off-by-one skipped the last).
    for page in range(1, MAX_PAGE + 1):
        realurl = url + "?page=" + str(page)
        soup = get_soup(realurl)
        for h4 in soup.find_all('h4', class_="h9 m5"):
            ted_talks_url = h4.a.get('href')
            wholewords += script_reader(ted_talks_url)
            talk += 1
            print(str(page) + " page: " + str(talk) + " scripts are analyzed")
            # Pace the crawl so TED does not flag us as a bot.
            time.sleep(TIME_SLEEP)
            if talk >= ALLCOUNT:
                print("ALL = " + str(len(wholewords)) + " words")
                en = nltk.FreqDist(wholewords)
                writeListToExcel(en.most_common(100))
                return
    # Listing exhausted before reaching ALLCOUNT: upload what we have.
    en = nltk.FreqDist(wholewords)
    writeListToExcel(en.most_common(100))


get_latest_ted(TED_TALK_URL)