import logging
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Set up logging configuration (overwrite the log file on each run)
logging.basicConfig(filename='unreal_docs.log', level=logging.INFO, filemode='w')
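
# Usage: list one documentation URL per line in urls.txt, then run the script.
# The combined page text is written to unreal_docs.txt and a run log to
# unreal_docs.log. Requires Firefox; older Selenium versions also need a
# matching geckodriver on the PATH (Selenium 4.6+ manages the driver itself).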

# Fetch the rendered HTML of a given URL using headless Firefox
def fetch_page_content(url):
    driver = None
    try:
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        # Give the page's JavaScript time to render before grabbing the source
        time.sleep(10)
        return driver.page_source
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")
        return None
    finally:
        # Always shut the browser down, even when the fetch fails
        if driver is not None:
            driver.quit()

# Parse the HTML content and extract the readable page text
def parse_html(content, url):
    soup = BeautifulSoup(content, 'html.parser')
    body_div = soup.find("div", {"id": "maincol"})
    if not body_div:
        logging.warning(f"body div not found in {url}")
        return None
    elements = body_div.find_all(['p', 'h1', 'h2', 'h3'])
    if not elements:
        logging.warning(f"No text tags found on {url}")
        return None
    # Extract the text from each element and collapse consecutive whitespace
    texts = [re.sub(r'\s+', ' ', e.get_text().strip()) for e in elements]
    return '\n\n'.join(texts) + '\n\n'
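
# Quick sanity check for parse_html, using a hypothetical snippet rather than a
# real docs page:
#
#   sample = '<div id="maincol"><h1>Title</h1><p>Body   text</p></div>'
#   parse_html(sample, 'test://example')   # -> 'Title\n\nBody text\n\n'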

# Loop through the list of page URLs, fetch and parse each one,
# and write the extracted text to a file
def main():
    # URLs of the pages to include in the .txt file, one per line
    with open('urls.txt', 'r') as f:
        page_urls = f.read().splitlines()

    # Accumulate the extracted text from every page
    docs_text = ''
    pages_written = 0
    for i, url in enumerate(page_urls):
        logging.info(f"Processing page {i+1}: {url}")
        content = fetch_page_content(url)
        # If the content could not be fetched, skip to the next page
        if not content:
            continue
        paragraphs = parse_html(content, url)
        # If the page container could not be found, skip to the next page
        if not paragraphs:
            continue
        docs_text += paragraphs
        pages_written += 1
        print(f'Successfully parsed page {i+1}')

    # Write the accumulated text to a file
    try:
        with open('unreal_docs.txt', 'w', encoding='utf-8') as f:
            f.write(docs_text)
        logging.info(f"Successfully wrote {pages_written} of {len(page_urls)} pages to file.")
        print("Script execution completed successfully.")
    except OSError as e:
        logging.error(f"Error writing to file: {e}")


if __name__ == '__main__':
    main()