Spaces:
Sleeping
Sleeping
| from bs4 import BeautifulSoup | |
| import re | |
| import requests as r | |
| from html2text import html2text | |
| import tqdm | |
def process_url(url, timeout=10):
    """Fetch one question page and extract its title, details and answers.

    Args:
        url: Address of the question page to download.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled server would block the crawl forever.

    Returns:
        A dict with the keys "title", "questionDetails", "url" and "answers".
        On success "answers" is a list of text strings, one for every
        ``answer_<n>`` div that contains an ``answerDetail`` div. On any
        failure the URL is appended to ``error_urls.txt`` and a placeholder
        dict whose "title", "questionDetails" and "answers" are empty strings
        is returned.
    """
    try:
        response = r.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, "html.parser")

        answers = []
        for answer_block in soup.find_all('div', {'id': re.compile(r'answer_\d+')}):
            # Look the detail div up once; blocks without it are skipped.
            detail = answer_block.find('div', {'class': "answerDetail"})
            if detail is not None:
                answers.append(html2text(str(detail.prettify())))

        title = soup.find('div', {'class': 'endTitleSection'}).text.strip()
        questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()
        # Remove the Korean "question" label from the title text. The literal
        # had been corrupted into mojibake and therefore never matched.
        title = title.replace("질문", '').strip()

        print("Answers extracted from: \n", url)
        print(len(answers))
        print('-'*60)
        return {
            "title": title,
            "questionDetails": questionDetails,
            "url": url,
            "answers": answers
        }
    except Exception as e:
        # Best-effort crawl: report the failure, remember the URL, carry on.
        print(f"Error processing URL {url}: {e}")
        # Append instead of truncating so every failed URL is kept, not only
        # the most recent one.
        with open('error_urls.txt', 'a', encoding='utf-8') as f:
            f.write(url + '\n')
        return {"title": '', "questionDetails": '', "url": url, "answers": ''}
def get_answers(results_a_elements, query):
    """Run ``process_url`` over every collected result link.

    Args:
        results_a_elements: Iterable of question-page URLs. A falsy value
            (``None`` or an empty list) short-circuits to an empty result.
        query: Not used by this function; kept as part of the signature.

    Returns:
        A list with one ``process_url`` result per link, in input order, or
        an empty list when there were no links.
    """
    if results_a_elements:
        print("Result links extracted: ", len(results_a_elements))
        # Links are handled one after another; tqdm only draws the progress bar.
        return [process_url(link) for link in tqdm.tqdm(results_a_elements)]

    print("No results found.")
    return []
def get_search_results(query, num_pages, timeout=10):
    """Collect question links from the Naver Knowledge iN search result pages.

    Args:
        query: Free-text search string. It is percent-encoded before being
            placed in the URL, so characters such as ``&``, ``#`` or ``=``
            cannot break the query string.
        num_pages: Number of result pages to request, starting at page 1.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of the ``href`` values of all result title anchors found on
        the requested pages. A page that fails to download or parse is
        reported on stdout and skipped.
    """
    # Local import keeps this block self-contained within the file.
    from urllib.parse import quote, urlencode

    results = []
    for page in range(1, num_pages + 1):
        query_string = urlencode({"query": query, "page": page}, quote_via=quote)
        url = f"https://kin.naver.com/search/list.naver?{query_string}"
        print("Starting the scraping process for:\n", url)
        try:
            response = r.get(url, timeout=timeout)
            soup = BeautifulSoup(response.text, "html.parser")
            # A page without the result list makes find() return None; the
            # resulting AttributeError is handled below like any other failure.
            result_list = soup.find("ul", {"class": "basic1"})
            anchors = result_list.find_all("a", {"class": "_searchListTitleAnchor"})
            results += [a.get('href') for a in anchors if a.get("href")]
        except Exception as e:
            # Best-effort: one bad page must not abort the remaining pages.
            print(f"Error while fetching search results: {e}")
    return results
def extract_data(query, num_pages=150) -> list[dict[str, object]]:
    """Search for *query* and scrape every question page that is found.

    Args:
        query: Search text passed to ``get_search_results``.
        num_pages: How many search result pages to walk (default 150).

    Returns:
        The list produced by ``get_answers``: one dict per result link.
    """
    collected = get_answers(get_search_results(query, num_pages), query)
    # The figure printed here is the number of scraped pages (list entries).
    print("Total answers collected:", len(collected))
    return collected
| # if __name__ == "__main__": | |
| # start = time.time() | |
| # query = "μ₯λν¬λ§, μΈκ³΅μ§λ₯ κ°λ°μ/μ°κ΅¬μ, νμ΄μ¬, μ€νμ μμ€, νμ΄μ¬ μ€μΉ, λμ μΆμ²" | |
| # answers = process_query(query) | |
| # print("Total answers collected:", len(answers)) | |
| # print("Time taken: ", time.time() - start) | |
| # # print(answers) | |
| # AJAX URL: | |
| # https://kin.naver.com/ajax/detail/answerList.naver? | |
| # dirId=401030201&docId=292159869 | |
| # &answerSortType=DEFAULT&answerViewType=DETAIL | |
| # &answerNo=&page=2&count=5&_=1736131792605 |