"""Selenium-based crawler: loads a start URL, captures cookies and a
screenshot for it and up to three same-domain links, and writes an Excel
report with hyperlinks to the per-page artifacts."""
# Standard library
import logging
import os
import shutil
import time
import uuid
from datetime import datetime, timezone
from urllib.parse import urlparse

# Third-party
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
|
|
|
|
# 配置日志
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# 配置 Chrome 无头模式
|
|
options = Options()
|
|
options.add_argument('--headless') # 无头模式
|
|
service = Service(executable_path='/opt/homebrew/bin/chromedriver')
|
|
|
|
# 获取浏览器驱动
|
|
def get_driver():
    """Create and return a headless Chrome WebDriver built from the
    module-level `service` and `options`."""
    driver = webdriver.Chrome(service=service, options=options)
    return driver
|
|
|
|
def format_expiry_time(expiry):
    """Convert a cookie 'expiry' value (Unix timestamp in seconds) to a
    human-readable UTC string.

    Returns 'N/A' when expiry is None (session cookie) and 'Invalid Time'
    when the value cannot be converted.
    """
    # Compare against None rather than truthiness: 0 is a valid epoch
    # timestamp and must not be reported as missing.
    if expiry is None:
        return 'N/A'
    try:
        # Timezone-aware conversion; datetime.utcfromtimestamp() is
        # deprecated since Python 3.12.
        return datetime.fromtimestamp(expiry, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError) as e:
        # Narrowed from bare Exception to the errors fromtimestamp can raise.
        logger.error(f"Error formatting expiry time: {e}")
        return 'Invalid Time'
|
|
|
|
# 获取页面的所有 a 标签和 Cookie
|
|
def get_page_data(driver, url, output_dir):
    """Load `url` in `driver`, capture its cookies and a screenshot, and
    collect every anchor href on the page.

    Returns a tuple (hrefs, cookies, screenshot_relpath) where the
    screenshot path is relative to `output_dir`; on any failure, logs the
    error and returns ([], [], None).
    """
    try:
        logger.info(f"Fetching page: {url}")
        driver.get(url)
        time.sleep(2)  # crude fixed wait for the page to finish loading

        page_cookies = driver.get_cookies()

        # Screenshots live under <output_dir>/images with UUID filenames so
        # captures from different pages never collide.
        image_dir = os.path.join(output_dir, 'images')
        os.makedirs(image_dir, exist_ok=True)
        shot_path = os.path.join(image_dir, f"{uuid.uuid4()}.png")
        driver.save_screenshot(shot_path)
        logger.info(f"Screenshot saved: {shot_path}")

        # Gather the non-empty href of every <a> element on the page.
        hrefs = []
        for anchor in driver.find_elements(By.TAG_NAME, 'a'):
            target = anchor.get_attribute('href')
            if target:
                hrefs.append(target)

        logger.info(f"Found {len(hrefs)} links on the page.")
        return hrefs, page_cookies, os.path.relpath(shot_path, output_dir)
    except Exception as e:
        logger.error(f"Error fetching page data: {e}")
        return [], [], None
|
|
|
|
# 生成单独的 Cookie 文件并返回其路径
|
|
def generate_cookie_file(cookies, base_url, output_dir):
    """Write `cookies` to a uniquely-named Excel file under
    <output_dir>/cookie and return its path relative to `output_dir`.

    `base_url` is not currently used by the body; it is kept for interface
    stability. Returns None if the file cannot be produced.
    """
    try:
        cookie_dir = os.path.join(output_dir, 'cookie')
        os.makedirs(cookie_dir, exist_ok=True)

        # One row per cookie; 'expiry' is optional (absent on session cookies).
        rows = []
        for cookie in cookies:
            rows.append({
                'Name': cookie['name'],
                'Domain': cookie['domain'],
                'Expires': format_expiry_time(cookie.get('expiry')),
            })

        # UUID filename avoids collisions between pages crawled in one run.
        file_path = os.path.join(cookie_dir, f"{uuid.uuid4()}_cookies.xlsx")
        pd.DataFrame(rows).to_excel(file_path, index=False, engine='openpyxl')

        logger.info(f"Cookie file saved: {file_path}")
        return os.path.relpath(file_path, output_dir)
    except Exception as e:
        logger.error(f"Error generating cookie file: {e}")
        return None
|
|
|
|
# 过滤掉与指定域名不一致的链接
|
|
def filter_links(hrefs, base_url):
    """Return a de-duplicated list of the entries in `hrefs` whose network
    location matches that of `base_url`."""
    base_domain = urlparse(base_url).netloc
    # A set comprehension performs the domain filter and de-duplication in
    # one pass.
    matching = {href for href in hrefs if urlparse(href).netloc == base_domain}
    logger.info(f"Filtered {len(matching)} links matching domain.")
    return list(matching)
|
|
|
|
# 生成主 Excel 文件
|
|
def generate_main_excel(data, output_file):
    """Persist the crawl results as the top-level Excel report.

    `data` is a list of row dicts (one per crawled page). Failures are
    logged rather than raised, matching the script's best-effort style.
    """
    try:
        report = pd.DataFrame(data)
        report.to_excel(output_file, index=False, engine='openpyxl')
        logger.info(f"Main Excel file generated: {output_file}")
    except Exception as e:
        logger.error(f"Error generating main Excel file: {e}")
|
|
|
|
# 主程序
|
|
def main(start_url, output_file):
    """Crawl `start_url`, visit up to three same-domain links, and write an
    Excel report linking each page to its cookie dump and screenshot.

    The directory containing `output_file` is wiped and recreated on every
    run, so all generated artifacts belong to the current crawl only.
    Errors are logged rather than raised.
    """
    driver = None
    try:
        logger.info(f"Starting main process with start URL: {start_url}")

        driver = get_driver()

        # Start from a clean output directory so stale screenshots and
        # cookie files from previous runs cannot leak into this report.
        output_dir = os.path.dirname(os.path.abspath(output_file))
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir, exist_ok=True)

        # Harvest links (plus an initial screenshot/cookies) from the start page.
        hrefs, cookies, screenshot_path = get_page_data(driver, start_url, output_dir)

        # Keep only same-domain links, capped at 3 to bound runtime.
        filtered_links = filter_links(hrefs, start_url)
        filtered_links = filtered_links[:3]

        results = []
        for link in filtered_links:
            logger.info(f"Processing {link}...")
            hrefs, cookies, screenshot_path = get_page_data(driver, link, output_dir)

            cookie_file_path = generate_cookie_file(cookies, link, output_dir)

            # Excel HYPERLINK formulas let the report open the per-page
            # artifacts directly; both paths are relative to output_dir.
            cookie_hyperlink = f'=HYPERLINK("{cookie_file_path}", "查看")' if cookie_file_path else 'No Cookie File'
            screenshot_hyperlink = f'=HYPERLINK("{screenshot_path}", "查看截图")' if screenshot_path else 'No Screenshot'

            results.append({
                'Page Path': link,
                'Cookie File': cookie_hyperlink,
                'Screenshot': screenshot_hyperlink
            })

        generate_main_excel(results, output_file)

    except Exception as e:
        logger.error(f"Error in main process: {e}")
    finally:
        # Bug fix: always release the browser process, even when the crawl
        # fails partway — previously an exception skipped driver.quit().
        if driver is not None:
            driver.quit()
|
|
|
|
# 设置入口参数
|
|
if __name__ == '__main__':
    # Page under test (CapCut landing page, tracking parameters left intact).
    start_url = 'https://www.capcut.com/tools/desktop-video-editor?utm_medium=sem&utm_source=googleadwords_int&pid=359289&af_c_id=21157337217&adset_id=162157605753&ad_id=697948663363&placement=&keyword_name=capcut&targetid=kwd-1406970026529&matchtype=e&gad_source=1&gclid=Cj0KCQiA4fi7BhC5ARIsAEV1Yib0W39PL05K1QxR6WhUe87uIu7P0JmCcRdVxIFSlCxwxCGtJw_0sZMaAiIMEALw_wcB'
    # Destination for the report; its parent directory is recreated by main().
    output_file = 'output/cookies_comparison_with_links.xlsx'
    main(start_url, output_file)
|