"""Crawl a start page, then export cookies and screenshots of same-domain links.

For the start URL the script collects every ``<a href>``, keeps the links that
share the start URL's network location, visits the first few of them, and for
each visited page writes a screenshot plus an Excel sheet of its cookies.  A
main Excel workbook links to those per-page artifacts.
"""

import logging
import os
import shutil
import time
import uuid
from datetime import datetime, timezone
from urllib.parse import urlparse

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configure Chrome to run in headless mode
options = Options()
options.add_argument('--headless')
service = Service(executable_path='/opt/homebrew/bin/chromedriver')

# Sub-directories (relative to the output directory) that this script owns.
IMAGE_SUBDIR = 'images'
COOKIE_SUBDIR = 'cookie'


def get_driver():
    """Return a new Chrome WebDriver built from the module-level service/options."""
    return webdriver.Chrome(service=service, options=options)


def format_expiry_time(expiry):
    """Format a cookie expiry timestamp as ``YYYY-MM-DD HH:MM:SS`` in UTC.

    Args:
        expiry: Seconds since the Unix epoch, or a falsy value (``None``, ``0``)
            when the cookie carries no expiry.

    Returns:
        The formatted time, ``'N/A'`` for a falsy ``expiry``, or
        ``'Invalid Time'`` when the value cannot be converted.
    """
    if not expiry:
        return 'N/A'
    try:
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp().
        moment = datetime.fromtimestamp(expiry, tz=timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError) as e:
        logger.error("Error formatting expiry time: %s", e)
        return 'Invalid Time'


def get_page_data(driver, url, output_dir):
    """Load ``url`` and collect its links, cookies and a screenshot.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to fetch.
        output_dir: Directory under which the ``images`` folder is created.

    Returns:
        A tuple ``(hrefs, cookies, screenshot_relpath)`` where ``hrefs`` is the
        list of non-empty ``href`` values of all ``<a>`` elements, ``cookies``
        is whatever ``driver.get_cookies()`` returned, and
        ``screenshot_relpath`` is the screenshot path relative to
        ``output_dir``.  On any error the failure is logged and
        ``([], [], None)`` is returned (best effort, never raises).
    """
    try:
        logger.info("Fetching page: %s", url)
        driver.get(url)
        time.sleep(2)  # fixed pause to give the page time to load

        cookies = driver.get_cookies()

        image_dir = os.path.join(output_dir, IMAGE_SUBDIR)
        os.makedirs(image_dir, exist_ok=True)

        # A random UUID keeps screenshot file names unique across pages.
        screenshot_path = os.path.join(image_dir, f"{uuid.uuid4()}.png")
        driver.save_screenshot(screenshot_path)
        logger.info("Screenshot saved: %s", screenshot_path)

        # Read each href exactly once: every get_attribute() call is a
        # round-trip to the browser.
        anchors = driver.find_elements(By.TAG_NAME, 'a')
        raw_hrefs = (anchor.get_attribute('href') for anchor in anchors)
        hrefs = [href for href in raw_hrefs if href]
        logger.info("Found %d links on the page.", len(hrefs))

        return hrefs, cookies, os.path.relpath(screenshot_path, output_dir)
    except Exception as e:
        logger.error("Error fetching page data: %s", e)
        return [], [], None


def generate_cookie_file(cookies, base_url, output_dir):
    """Write ``cookies`` to a uniquely named Excel file.

    Args:
        cookies: Iterable of cookie dicts; each must have ``'name'`` and
            ``'domain'`` keys and may have an ``'expiry'`` key.
        base_url: Page the cookies belong to (currently unused; kept for
            interface compatibility).
        output_dir: Directory under which the ``cookie`` folder is created.

    Returns:
        Path of the written file relative to ``output_dir``, or ``None`` when
        writing failed (the error is logged).
    """
    try:
        cookie_dir = os.path.join(output_dir, COOKIE_SUBDIR)
        os.makedirs(cookie_dir, exist_ok=True)

        rows = [
            {
                'Name': cookie['name'],
                'Domain': cookie['domain'],
                'Expires': format_expiry_time(cookie.get('expiry')),
            }
            for cookie in cookies
        ]
        cookie_df = pd.DataFrame(rows)

        file_path = os.path.join(cookie_dir, f"{uuid.uuid4()}_cookies.xlsx")
        cookie_df.to_excel(file_path, index=False, engine='openpyxl')
        logger.info("Cookie file saved: %s", file_path)

        return os.path.relpath(file_path, output_dir)
    except Exception as e:
        logger.error("Error generating cookie file: %s", e)
        return None


def filter_links(hrefs, base_url):
    """Return the unique links whose network location matches ``base_url``.

    Duplicates are dropped while the first-seen order is preserved, so callers
    that take a prefix of the result get the same links on every run.  Links
    that ``urlparse`` rejects are skipped.

    Args:
        hrefs: Iterable of URL strings.
        base_url: URL whose ``netloc`` the links must match exactly.

    Returns:
        List of matching links, de-duplicated, in order of first appearance.
    """
    base_domain = urlparse(base_url).netloc
    matching = []
    for href in hrefs:
        try:
            same_domain = urlparse(href).netloc == base_domain
        except ValueError:
            # Malformed URL (e.g. an unbalanced IPv6 bracket): ignore it
            # instead of aborting the whole run.
            logger.warning("Skipping malformed link: %s", href)
            continue
        if same_domain:
            matching.append(href)

    # dict.fromkeys de-duplicates while keeping insertion order.
    filtered_links = list(dict.fromkeys(matching))
    logger.info("Filtered %d links matching domain.", len(filtered_links))
    return filtered_links


def generate_main_excel(data, output_file):
    """Write ``data`` (a list of row dicts) to ``output_file`` as Excel.

    Errors are logged rather than raised.
    """
    try:
        df = pd.DataFrame(data)
        df.to_excel(output_file, index=False, engine='openpyxl')
        logger.info("Main Excel file generated: %s", output_file)
    except Exception as e:
        logger.error("Error generating main Excel file: %s", e)


def _prepare_output_dir(output_file):
    """Create the directory of ``output_file`` and clear this script's artifacts.

    Only the ``images`` and ``cookie`` sub-directories are removed.  The
    directory itself is never deleted, because it may be the working directory
    or contain unrelated files when ``output_file`` has no dedicated folder.

    Returns:
        Absolute path of the output directory.
    """
    output_dir = os.path.dirname(os.path.abspath(output_file))
    os.makedirs(output_dir, exist_ok=True)
    for subdir in (IMAGE_SUBDIR, COOKIE_SUBDIR):
        shutil.rmtree(os.path.join(output_dir, subdir), ignore_errors=True)
    return output_dir


def main(start_url, output_file, max_links=3):
    """Crawl ``start_url`` and write the cookie/screenshot report.

    Args:
        start_url: Page whose same-domain links are visited.
        output_file: Path of the main Excel workbook; per-page artifacts are
            written next to it.
        max_links: Maximum number of filtered links to visit (default 3).

    Any error is logged; the browser is always closed once it was started.
    """
    try:
        logger.info("Starting main process with start URL: %s", start_url)

        output_dir = _prepare_output_dir(output_file)

        driver = get_driver()
        try:
            # The start page is loaded only to discover links; its own cookies
            # are not part of the report.
            hrefs, _, _ = get_page_data(driver, start_url, output_dir)

            filtered_links = filter_links(hrefs, start_url)[:max_links]

            results = []
            for link in filtered_links:
                logger.info("Processing %s...", link)
                _, cookies, screenshot_path = get_page_data(driver, link, output_dir)

                cookie_file_path = generate_cookie_file(cookies, link, output_dir)

                # Excel HYPERLINK formulas pointing at the per-page artifacts.
                if cookie_file_path:
                    cookie_hyperlink = f'=HYPERLINK("{cookie_file_path}", "查看")'
                else:
                    cookie_hyperlink = 'No Cookie File'
                if screenshot_path:
                    screenshot_hyperlink = f'=HYPERLINK("{screenshot_path}", "查看截图")'
                else:
                    screenshot_hyperlink = 'No Screenshot'

                results.append({
                    'Page Path': link,
                    'Cookie File': cookie_hyperlink,
                    'Screenshot': screenshot_hyperlink,
                })

            generate_main_excel(results, output_file)
        finally:
            # Always shut the browser down, even when a step above failed,
            # so no Chrome/chromedriver process is left behind.
            driver.quit()
    except Exception as e:
        logger.error("Error in main process: %s", e)


# Entry point parameters
if __name__ == '__main__':
    start_url = 'https://www.capcut.com/tools/desktop-video-editor?utm_medium=sem&utm_source=googleadwords_int&pid=359289&af_c_id=21157337217&adset_id=162157605753&ad_id=697948663363&placement=&keyword_name=capcut&targetid=kwd-1406970026529&matchtype=e&gad_source=1&gclid=Cj0KCQiA4fi7BhC5ARIsAEV1Yib0W39PL05K1QxR6WhUe87uIu7P0JmCcRdVxIFSlCxwxCGtJw_0sZMaAiIMEALw_wcB'  # page to test
    output_file = 'output/cookies_comparison_with_links.xlsx'  # report output file
    main(start_url, output_file)