Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Fri May 29 00:38:13 2020 | |
| @author: ASUS | |
| """ | |
| # Import the requests module (HTTP client) | |
| import requests | |
| # Import the BeautifulSoup module: a tool for parsing HTML | |
| import bs4 | |
| # Article link | |
| # URL = "https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html" | |
| URL = """https://www.ptt.cc/bbs/Gossiping/index.html""" | |
| from urllib.parse import urlparse | |
def get_host(URL):
    """Return the scheme-and-host root of *URL*, ending with a slash.

    For example ``https://www.ptt.cc/bbs/Gossiping/index.html`` yields
    ``https://www.ptt.cc/``. Path, query and fragment are discarded.
    """
    parts = urlparse(URL)
    return f"{parts.scheme}://{parts.netloc}/"
def proc(ch, HOSTNAME):
    """Convert one article row of a board index page into a plain dict.

    Args:
        ch: Element for a single row. It is queried with ``select`` for its
            ``.title``, ``.author`` and ``.date`` cells.
        HOSTNAME: Base URL that the article's ``href`` is resolved against.

    Returns:
        A dict with the keys ``title``, ``link``, ``author`` and ``date``,
        or ``None`` when the title cell does not contain exactly one link
        with an ``href`` (presumably a deleted post -- TODO confirm).
        ``author`` / ``date`` are empty strings when their cell is missing.
    """
    # Local import keeps this block self-contained; the file's top-level
    # imports only bring in ``urlparse``.
    from urllib.parse import urljoin

    def first_text(selector):
        # Text of the first element matching *selector*, or '' if none.
        found = ch.select(selector)
        return found[0].getText() if found else ''

    title_cells = ch.select('.title')
    anchors = title_cells[0].select('a') if title_cells else []
    if len(anchors) != 1:
        return None
    title_a = anchors[0]

    href = title_a.attrs.get('href')
    if not href:
        return None

    return dict(
        title=title_a.getText(),
        # urljoin avoids the "host///path" result that plain concatenation
        # gives when HOSTNAME ends with '/' and href starts with '/'.
        link=urljoin(HOSTNAME, href),
        author=first_text('.author'),
        date=first_text('.date'),
    )
def getall(URL, timeout=10):
    """Download one board index page and extract its article rows.

    Args:
        URL: Address of the index page to fetch.
        timeout: Seconds ``requests.get`` may wait for the server before
            raising instead of hanging indefinitely.

    Returns:
        A tuple ``(articles, prev_page, next_page)``. ``articles`` is the
        list of dicts returned by ``proc`` for each ``r-ent`` row found
        before the ``r-list-sep`` separator; it is empty when the list
        container is absent. ``prev_page`` and ``next_page`` are absolute
        URLs taken from the "上頁" / "下頁" buttons, or ``None`` when that
        button is missing, disabled, or has no ``href``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Local import keeps this block self-contained; the file's top-level
    # imports only bring in ``urlparse``.
    from urllib.parse import urljoin

    HOSTNAME = get_host(URL)

    # 'over18' cookie -- presumably skips the board's age-confirmation
    # page; verify against the site.
    response = requests.get(URL, cookies={'over18': '1'}, timeout=timeout)
    response.raise_for_status()

    # Parse the downloaded HTML.
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    articles = []
    container = soup.select_one("div.r-list-container")
    if container is not None:
        for child in container.children:
            # Skip text nodes between the row elements.
            if not isinstance(child, bs4.element.Tag):
                continue
            classes = child.get('class') or []
            if 'r-list-sep' in classes:
                # Rows after the separator are intentionally not collected.
                break
            if 'r-ent' in classes:
                entry = proc(child, HOSTNAME)
                if entry:
                    articles.append(entry)

    prev_page, next_page = None, None
    for button in soup.select('a.btn.wide'):
        href = button.get('href')
        if not href or 'disabled' in (button.get('class') or []):
            continue
        label = button.getText()
        # urljoin resolves the root-relative href without producing the
        # "host///path" form that string concatenation gave.
        if '上頁' in label:
            prev_page = urljoin(URL, href)
        if '下頁' in label:
            next_page = urljoin(URL, href)

    return articles, prev_page, next_page
URL = "https://www.ptt.cc/bbs/Gossiping/index.html"
RR = []
from tqdm import tqdm

# Walk backwards from the newest index page, following the "previous page"
# link for at most 100 pages and accumulating every article dict in RR.
for _ in tqdm(range(100)):
    res, prev_, next_ = getall(URL)
    if res:
        # Show the date of the first and last article on this page.
        print(res[0]["date"], end='\t')
        print(res[-1]["date"])
    RR.extend(res)
    if prev_ is None:
        # No earlier page to follow; stop instead of requesting URL=None.
        break
    URL = prev_