commit 4eb1564b91abdd98447de04d84f864c9fb807f66
Author: Owen <linkto.zc@gmail.com>
Date:   Thu Jan 16 21:15:36 2025 +0800

    feat: 初次提交

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..cfd17af
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b7754f8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+venv/
+output/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c66ea81
--- /dev/null
+++ b/README.md
@@ -0,0 +1,4 @@
+### 创建虚拟环境
+`python3 -m venv venv`, then `source venv/bin/activate` (macOS/Linux) or `venv\Scripts\activate` (Windows)
+### 安装依赖
+pip install -r requirements.txt
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e0adb89
--- /dev/null
+++ b/main.py
@@ -0,0 +1,169 @@
+import time
+import pandas as pd
+import logging
+import os
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from urllib.parse import urlparse
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+import uuid
+from datetime import datetime
+import shutil
+
+# Logging: timestamp, level and message for every record at INFO and above.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Headless Chrome. Set CHROMEDRIVER_PATH to override the default (Homebrew) chromedriver location.
+options = Options()
+options.add_argument('--headless')  # run without a visible browser window
+service = Service(executable_path=os.environ.get('CHROMEDRIVER_PATH', '/opt/homebrew/bin/chromedriver'))
+
+# Create a new Chrome WebDriver configured with the module-level `service` and `options`.
+def get_driver():
+    return webdriver.Chrome(service=service, options=options)
+
+def format_expiry_time(expiry):
+    """Format a cookie expiry (Unix epoch seconds) as 'YYYY-MM-DD HH:MM:SS' in UTC; 'N/A' if falsy, 'Invalid Time' on failure."""
+    from datetime import timezone  # function-scope import: the module header only imports `datetime`
+    try:
+        # Timezone-aware conversion; datetime.utcfromtimestamp() is deprecated since Python 3.12.
+        return datetime.fromtimestamp(expiry, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S') if expiry else 'N/A'
+    except (OverflowError, OSError, ValueError, TypeError) as e:
+        logger.error(f"Error formatting expiry time: {e}")
+        return 'Invalid Time'
+
+def get_page_data(driver, url, output_dir):
+    """Load `url` and collect its link targets, cookies and a screenshot.
+
+    Returns `(hrefs, cookies, screenshot_relpath)`: the non-empty `href` of every
+    anchor (a) element, the result of `driver.get_cookies()`, and the path of the
+    PNG saved under output_dir/images/, expressed relative to `output_dir`.
+    Any exception is logged and reported as `([], [], None)` instead of raised.
+    """
+    try:
+        logger.info(f"Fetching page: {url}")
+        driver.get(url)
+        time.sleep(2)  # fixed pause so the page can finish loading
+        cookies = driver.get_cookies()
+
+        # Screenshots get a UUID file name so pages never overwrite each other.
+        image_dir = os.path.join(output_dir, 'images')
+        os.makedirs(image_dir, exist_ok=True)
+        screenshot_path = os.path.join(image_dir, f"{uuid.uuid4()}.png")
+        driver.save_screenshot(screenshot_path)
+        logger.info(f"Screenshot saved: {screenshot_path}")
+
+        # Read each href once: every get_attribute() call is a separate WebDriver command.
+        anchors = driver.find_elements(By.TAG_NAME, 'a')
+        hrefs = [href for href in (a.get_attribute('href') for a in anchors) if href]
+        logger.info(f"Found {len(hrefs)} links on the page.")
+        return hrefs, cookies, os.path.relpath(screenshot_path, output_dir)
+    except Exception as e:
+        logger.error(f"Error fetching page data: {e}")
+        return [], [], None
+
+def generate_cookie_file(cookies, base_url, output_dir):
+    """Save `cookies` as an .xlsx file under output_dir/cookie/ and return its path relative to `output_dir`.
+
+    One row per cookie with the columns Name, Domain and Expires (formatted expiry). `base_url`
+    is not used; it is kept so existing callers keep working. Returns None after logging on failure.
+    """
+    try:
+        cookie_dir = os.path.join(output_dir, 'cookie')
+        os.makedirs(cookie_dir, exist_ok=True)
+
+        rows = [(c['name'], c['domain'], format_expiry_time(c.get('expiry'))) for c in cookies]
+        # Explicit column names keep the header row even when the page set no cookies.
+        cookie_df = pd.DataFrame(rows, columns=['Name', 'Domain', 'Expires'])
+
+        file_path = os.path.join(cookie_dir, f"{uuid.uuid4()}_cookies.xlsx")  # UUID keeps names unique
+        cookie_df.to_excel(file_path, index=False, engine='openpyxl')
+        logger.info(f"Cookie file saved: {file_path}")
+        return os.path.relpath(file_path, output_dir)
+    except Exception as e:
+        logger.error(f"Error generating cookie file: {e}")
+        return None
+
+def filter_links(hrefs, base_url):
+    """Return the unique links in `hrefs` whose netloc equals that of `base_url`, in first-seen order."""
+    base_domain = urlparse(base_url).netloc
+
+    # dict.fromkeys() de-duplicates while keeping insertion order. A set has no stable order
+    # across runs (string hashing is randomized), so callers slicing the result would
+    # otherwise visit a different selection of pages on every run.
+    filtered_links = list(dict.fromkeys(h for h in hrefs if urlparse(h).netloc == base_domain))
+
+    logger.info(f"Filtered {len(filtered_links)} links matching domain.")
+    return filtered_links
+
+def generate_main_excel(data, output_file):
+    """Write the summary rows in `data` to `output_file` as an .xlsx workbook (openpyxl engine, no index column).
+
+    Failures are logged and swallowed; nothing is returned.
+    """
+    try:
+        pd.DataFrame(data).to_excel(output_file, index=False, engine='openpyxl')
+        logger.info(f"Main Excel file generated: {output_file}")
+    except Exception as err:
+        logger.error(f"Error generating main Excel file: {err}")
+
+def main(start_url, output_file, max_pages=3):
+    """Crawl `start_url` plus same-domain pages it links to and write a summary workbook.
+
+    For every visited link a screenshot (images/) and a cookie workbook (cookie/) are
+    saved in the directory of `output_file`; `output_file` gets one row per link with
+    HYPERLINK formulas pointing at both. Errors are logged, not raised.
+
+    Args:
+        start_url: page whose anchor links seed the crawl.
+        output_file: path of the summary .xlsx file.
+        max_pages: maximum number of linked pages to visit (None means no limit);
+            the default of 3 matches the limit that used to be hard-coded.
+    """
+    try:
+        logger.info(f"Starting main process with start URL: {start_url}")
+        driver = get_driver()
+        try:
+            # Remove only what this script generates. Deleting the whole output directory
+            # would wipe the working directory when `output_file` has no directory part.
+            output_dir = os.path.dirname(os.path.abspath(output_file))
+            for stale in ('images', 'cookie'):
+                shutil.rmtree(os.path.join(output_dir, stale), ignore_errors=True)
+            if os.path.isfile(output_file):
+                os.remove(output_file)  # a failed run must not leave a previous summary behind
+            os.makedirs(output_dir, exist_ok=True)
+
+            # The start page only seeds the crawl; its own cookies are not reported.
+            hrefs, _, _ = get_page_data(driver, start_url, output_dir)
+            filtered_links = filter_links(hrefs, start_url)[:max_pages]
+
+            results = []
+            for link in filtered_links:
+                logger.info(f"Processing {link}...")
+                _, cookies, screenshot_path = get_page_data(driver, link, output_dir)
+                cookie_file_path = generate_cookie_file(cookies, link, output_dir)
+
+                # Both paths are relative to output_dir, the folder holding the summary workbook.
+                cookie_hyperlink = f'=HYPERLINK("{cookie_file_path}", "查看")' if cookie_file_path else 'No Cookie File'
+                screenshot_hyperlink = f'=HYPERLINK("{screenshot_path}", "查看截图")' if screenshot_path else 'No Screenshot'
+                results.append({
+                    'Page Path': link,
+                    'Cookie File': cookie_hyperlink,
+                    'Screenshot': screenshot_hyperlink,
+                })
+
+            generate_main_excel(results, output_file)
+        finally:
+            # Quit even when a step above fails, so no browser process is left running.
+            driver.quit()
+    except Exception as e:
+        logger.error(f"Error in main process: {e}")
+
+# Script entry point: the crawl parameters are set here.
+if __name__ == '__main__':
+    start_url = 'https://www.capcut.com/tools/desktop-video-editor?utm_medium=sem&utm_source=googleadwords_int&pid=359289&af_c_id=21157337217&adset_id=162157605753&ad_id=697948663363&placement=&keyword_name=capcut&targetid=kwd-1406970026529&matchtype=e&gad_source=1&gclid=Cj0KCQiA4fi7BhC5ARIsAEV1Yib0W39PL05K1QxR6WhUe87uIu7P0JmCcRdVxIFSlCxwxCGtJw_0sZMaAiIMEALw_wcB'  # page under test
+    output_file = 'output/cookies_comparison_with_links.xlsx'  # path of the summary workbook
+    main(start_url, output_file)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..21df64c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+attrs==24.3.0
+certifi==2024.12.14
+et_xmlfile==2.0.0
+h11==0.14.0
+idna==3.10
+numpy==2.2.1
+openpyxl==3.1.5
+outcome==1.3.0.post0
+pandas==2.2.3
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+selenium==4.27.1
+six==1.17.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+trio==0.28.0
+trio-websocket==0.11.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.3.0
+websocket-client==1.8.0
+wsproto==1.2.0