commit 4eb1564b91abdd98447de04d84f864c9fb807f66
Author: Owen
Date:   Thu Jan 16 21:15:36 2025 +0800

    feat: 初次提交

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..cfd17af
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b7754f8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+venv/
+output/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c66ea81
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+### 创建虚拟环境
+```bash
+python3 -m venv venv
+source venv/bin/activate  # macOS/Linux
+venv\Scripts\activate     # Windows
+```
+### 安装依赖
+```bash
+pip install -r requirements.txt
+```
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e0adb89
--- /dev/null
+++ b/main.py
@@ -0,0 +1,216 @@
+"""Crawl a start page and report cookies and screenshots for its same-domain links.
+
+For the first few links on the start page that share its domain, the script
+saves a screenshot and an Excel sheet of cookies, then writes a main Excel
+workbook whose rows point at those files through HYPERLINK formulas.
+"""
+import time
+import pandas as pd
+import logging
+import os
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from urllib.parse import urlparse
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+import uuid
+from datetime import datetime, timezone
+import shutil
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Configure Chrome headless mode
+options = Options()
+options.add_argument('--headless')  # headless mode
+
+# Driver location: the CHROMEDRIVER_PATH environment variable wins, then the
+# Homebrew default if that file exists; otherwise None leaves driver discovery
+# to Selenium.
+DEFAULT_CHROMEDRIVER_PATH = '/opt/homebrew/bin/chromedriver'
+_driver_path = os.environ.get('CHROMEDRIVER_PATH')
+if not _driver_path and os.path.exists(DEFAULT_CHROMEDRIVER_PATH):
+    _driver_path = DEFAULT_CHROMEDRIVER_PATH
+service = Service(executable_path=_driver_path or None)
+
+
+# Get the browser driver
+def get_driver():
+    """Return a new Chrome WebDriver built from the module-level service and options."""
+    return webdriver.Chrome(service=service, options=options)
+
+
+def format_expiry_time(expiry):
+    """Format a cookie expiry timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC.
+
+    Returns 'N/A' when expiry is falsy (for example when the cookie has no
+    'expiry' key) and 'Invalid Time' when the value cannot be converted.
+    """
+    if not expiry:
+        return 'N/A'
+    try:
+        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp().
+        return datetime.fromtimestamp(expiry, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
+    except (OverflowError, OSError, ValueError, TypeError) as e:
+        logger.error("Error formatting expiry time: %s", e)
+        return 'Invalid Time'
+
+
+# Get all anchor links and cookies of a page
+def get_page_data(driver, url, output_dir):
+    """Load url, save a screenshot, and collect the page's links and cookies.
+
+    Returns (hrefs, cookies, screenshot_path), where screenshot_path is relative
+    to output_dir. Returns ([], [], None) if any step fails.
+    """
+    try:
+        logger.info("Fetching page: %s", url)
+        driver.get(url)
+        time.sleep(2)  # fixed pause to let the page load
+
+        # Cookies of the page
+        cookies = driver.get_cookies()
+
+        # Directory for screenshots
+        image_dir = os.path.join(output_dir, 'images')
+        os.makedirs(image_dir, exist_ok=True)
+
+        # Save a screenshot under a unique UUID file name
+        screenshot_path = os.path.join(image_dir, f"{uuid.uuid4()}.png")
+        driver.save_screenshot(screenshot_path)
+        logger.info("Screenshot saved: %s", screenshot_path)
+
+        # Read each href once instead of calling get_attribute() twice per link
+        links = driver.find_elements(By.TAG_NAME, 'a')
+        all_hrefs = (link.get_attribute('href') for link in links)
+        hrefs = [href for href in all_hrefs if href]
+
+        logger.info("Found %d links on the page.", len(hrefs))
+        return hrefs, cookies, os.path.relpath(screenshot_path, output_dir)
+    except Exception as e:
+        logger.exception("Error fetching page data: %s", e)
+        return [], [], None
+
+
+# Write one cookie file and return its path
+def generate_cookie_file(cookies, base_url, output_dir):
+    """Save cookies to a uniquely named Excel file under output_dir/cookie.
+
+    base_url is not used; it stays in the signature for existing callers.
+    Returns the file path relative to output_dir, or None on failure.
+    """
+    try:
+        cookie_dir = os.path.join(output_dir, 'cookie')
+        os.makedirs(cookie_dir, exist_ok=True)
+
+        cookie_data = [
+            {
+                'Name': cookie['name'],
+                'Domain': cookie['domain'],
+                'Expires': format_expiry_time(cookie.get('expiry')),
+            }
+            for cookie in cookies
+        ]
+        # Explicit columns keep the header row even when there are no cookies
+        cookie_df = pd.DataFrame(cookie_data, columns=['Name', 'Domain', 'Expires'])
+
+        unique_filename = f"{uuid.uuid4()}_cookies.xlsx"
+        file_path = os.path.join(cookie_dir, unique_filename)
+        cookie_df.to_excel(file_path, index=False, engine='openpyxl')
+
+        logger.info("Cookie file saved: %s", file_path)
+        return os.path.relpath(file_path, output_dir)
+    except Exception as e:
+        logger.exception("Error generating cookie file: %s", e)
+        return None
+
+
+# Keep only links on the same domain as base_url
+def filter_links(hrefs, base_url):
+    """Return the unique links in hrefs whose netloc equals that of base_url.
+
+    Links keep their order of first appearance.
+    """
+    base_domain = urlparse(base_url).netloc
+    # dict.fromkeys de-duplicates while preserving order; a set would make the
+    # order, and therefore the links picked by main(), vary between runs.
+    filtered_links = list(dict.fromkeys(
+        href for href in hrefs if urlparse(href).netloc == base_domain
+    ))
+
+    logger.info("Filtered %d links matching domain.", len(filtered_links))
+    return filtered_links
+
+
+# Generate the main Excel file
+def generate_main_excel(data, output_file):
+    """Write the result rows in data to output_file as an Excel workbook."""
+    try:
+        df = pd.DataFrame(data)
+        df.to_excel(output_file, index=False, engine='openpyxl')
+        logger.info("Main Excel file generated: %s", output_file)
+    except Exception as e:
+        logger.exception("Error generating main Excel file: %s", e)
+
+
+# Main program
+def main(start_url, output_file, max_links=3):
+    """Crawl start_url and write the cookie and screenshot report to output_file.
+
+    Only the first max_links same-domain links found on the start page are
+    visited; pass None to visit all of them.
+    """
+    driver = None
+    try:
+        logger.info("Starting main process with start URL: %s", start_url)
+
+        output_dir = os.path.dirname(os.path.abspath(output_file))
+        os.makedirs(output_dir, exist_ok=True)
+        # Clear only what this script generates: removing output_dir itself
+        # would delete unrelated files that share a directory with output_file.
+        for generated in ('images', 'cookie'):
+            shutil.rmtree(os.path.join(output_dir, generated), ignore_errors=True)
+
+        driver = get_driver()
+
+        # Only the start page's links are used; its cookies are not reported
+        hrefs, _, _ = get_page_data(driver, start_url, output_dir)
+
+        # Filter, de-duplicate, and cap the links to visit
+        filtered_links = filter_links(hrefs, start_url)[:max_links]
+
+        results = []
+
+        # Visit each link and record its cookies and screenshot
+        for link in filtered_links:
+            logger.info("Processing %s...", link)
+            _, cookies, screenshot_path = get_page_data(driver, link, output_dir)
+
+            cookie_file_path = generate_cookie_file(cookies, link, output_dir)
+
+            # Excel HYPERLINK formulas pointing at the generated files
+            cookie_hyperlink = f'=HYPERLINK("{cookie_file_path}", "查看")' if cookie_file_path else 'No Cookie File'
+            screenshot_hyperlink = f'=HYPERLINK("{screenshot_path}", "查看截图")' if screenshot_path else 'No Screenshot'
+
+            results.append({
+                'Page Path': link,
+                'Cookie File': cookie_hyperlink,
+                'Screenshot': screenshot_hyperlink
+            })
+
+        generate_main_excel(results, output_file)
+    except Exception as e:
+        logger.exception("Error in main process: %s", e)
+    finally:
+        # Always close the browser, even when a step above raised
+        if driver is not None:
+            driver.quit()
+
+
+# Entry point parameters
+if __name__ == '__main__':
+    start_url = 'https://www.capcut.com/tools/desktop-video-editor?utm_medium=sem&utm_source=googleadwords_int&pid=359289&af_c_id=21157337217&adset_id=162157605753&ad_id=697948663363&placement=&keyword_name=capcut&targetid=kwd-1406970026529&matchtype=e&gad_source=1&gclid=Cj0KCQiA4fi7BhC5ARIsAEV1Yib0W39PL05K1QxR6WhUe87uIu7P0JmCcRdVxIFSlCxwxCGtJw_0sZMaAiIMEALw_wcB'  # page to test
+    output_file = 'output/cookies_comparison_with_links.xlsx'  # result output file
+    main(start_url, output_file)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..21df64c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+attrs==24.3.0
+certifi==2024.12.14
+et_xmlfile==2.0.0
+h11==0.14.0
+idna==3.10
+numpy==2.2.1
+openpyxl==3.1.5
+outcome==1.3.0.post0
+pandas==2.2.3
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+selenium==4.27.1
+six==1.17.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+trio==0.28.0
+trio-websocket==0.11.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.3.0
+websocket-client==1.8.0
+wsproto==1.2.0