"""Selenium-based crawler: loads a start URL, captures cookies and a
screenshot for it and up to three same-domain links, and writes an Excel
report with hyperlinks to the per-page artifacts."""
# Standard library
import logging
import os
import shutil
import time
import uuid
from datetime import datetime, timezone
from urllib.parse import urlparse

# Third-party
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
|
|
|
|
# 配置日志
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# 配置 Chrome 无头模式
|
|
options = Options()
|
|
options.add_argument('--headless') # 无头模式
|
|
service = Service(executable_path='/opt/homebrew/bin/chromedriver')
|
|
|
|
# 获取浏览器驱动
|
|
def get_driver():
    """Create and return a headless Chrome WebDriver built from the
    module-level `service` and `options`."""
    driver = webdriver.Chrome(service=service, options=options)
    return driver
|
|
|
|
def format_expiry_time(expiry):
    """Convert a cookie 'expiry' value (Unix timestamp in seconds) to a
    human-readable UTC string.

    Returns 'N/A' when expiry is None (session cookie) and 'Invalid Time'
    when the value cannot be converted.
    """
    # Compare against None rather than truthiness: 0 is a valid epoch
    # timestamp and must not be reported as missing.
    if expiry is None:
        return 'N/A'
    try:
        # Timezone-aware conversion; datetime.utcfromtimestamp() is
        # deprecated since Python 3.12.
        return datetime.fromtimestamp(expiry, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, OSError) as e:
        # Narrowed from bare Exception to the errors fromtimestamp can raise.
        logger.error(f"Error formatting expiry time: {e}")
        return 'Invalid Time'
|
|
|
|
# 获取页面的所有 a 标签和 Cookie
|
|
def get_page_data(driver, url, output_dir):
    """Load `url` in `driver`, capture its cookies and a screenshot, and
    collect every anchor href on the page.

    Returns a tuple (hrefs, cookies, screenshot_relpath) where the
    screenshot path is relative to `output_dir`; on any failure, logs the
    error and returns ([], [], None).
    """
    try:
        logger.info(f"Fetching page: {url}")
        driver.get(url)
        time.sleep(2)  # crude fixed wait for the page to finish loading

        page_cookies = driver.get_cookies()

        # Screenshots live under <output_dir>/images with UUID filenames so
        # captures from different pages never collide.
        image_dir = os.path.join(output_dir, 'images')
        os.makedirs(image_dir, exist_ok=True)
        shot_path = os.path.join(image_dir, f"{uuid.uuid4()}.png")
        driver.save_screenshot(shot_path)
        logger.info(f"Screenshot saved: {shot_path}")

        # Gather the non-empty href of every <a> element on the page.
        hrefs = []
        for anchor in driver.find_elements(By.TAG_NAME, 'a'):
            target = anchor.get_attribute('href')
            if target:
                hrefs.append(target)

        logger.info(f"Found {len(hrefs)} links on the page.")
        return hrefs, page_cookies, os.path.relpath(shot_path, output_dir)
    except Exception as e:
        logger.error(f"Error fetching page data: {e}")
        return [], [], None
|
|
|
|
# 生成单独的 Cookie 文件并返回其路径
|
|
def generate_cookie_file(cookies, base_url, output_dir):
    """Write `cookies` to a uniquely-named Excel file under
    <output_dir>/cookie and return its path relative to `output_dir`.

    `base_url` is not currently used by the body; it is kept for interface
    stability. Returns None if the file cannot be produced.
    """
    try:
        cookie_dir = os.path.join(output_dir, 'cookie')
        os.makedirs(cookie_dir, exist_ok=True)

        # One row per cookie; 'expiry' is optional (absent on session cookies).
        rows = []
        for cookie in cookies:
            rows.append({
                'Name': cookie['name'],
                'Domain': cookie['domain'],
                'Expires': format_expiry_time(cookie.get('expiry')),
            })

        # UUID filename avoids collisions between pages crawled in one run.
        file_path = os.path.join(cookie_dir, f"{uuid.uuid4()}_cookies.xlsx")
        pd.DataFrame(rows).to_excel(file_path, index=False, engine='openpyxl')

        logger.info(f"Cookie file saved: {file_path}")
        return os.path.relpath(file_path, output_dir)
    except Exception as e:
        logger.error(f"Error generating cookie file: {e}")
        return None
|
|
|
|
# 过滤掉与指定域名不一致的链接
|
|
def filter_links(hrefs, base_url):
    """Return a de-duplicated list of the entries in `hrefs` whose network
    location matches that of `base_url`."""
    base_domain = urlparse(base_url).netloc
    # A set comprehension performs the domain filter and de-duplication in
    # one pass.
    matching = {href for href in hrefs if urlparse(href).netloc == base_domain}
    logger.info(f"Filtered {len(matching)} links matching domain.")
    return list(matching)
|
|
|
|
# 生成主 Excel 文件
|
|
def generate_main_excel(data, output_file):
    """Persist the crawl results as the top-level Excel report.

    `data` is a list of row dicts (one per crawled page). Failures are
    logged rather than raised, matching the script's best-effort style.
    """
    try:
        report = pd.DataFrame(data)
        report.to_excel(output_file, index=False, engine='openpyxl')
        logger.info(f"Main Excel file generated: {output_file}")
    except Exception as e:
        logger.error(f"Error generating main Excel file: {e}")
|
|
|
|
# 主程序
|
|
def main(start_url, output_file):
    """Crawl `start_url`, visit up to three same-domain links, and write an
    Excel report linking each page to its cookie dump and screenshot.

    The directory containing `output_file` is wiped and recreated on every
    run, so all generated artifacts belong to the current crawl only.
    Errors are logged rather than raised.
    """
    driver = None
    try:
        logger.info(f"Starting main process with start URL: {start_url}")

        driver = get_driver()

        # Start from a clean output directory so stale screenshots and
        # cookie files from previous runs cannot leak into this report.
        output_dir = os.path.dirname(os.path.abspath(output_file))
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir, exist_ok=True)

        # Harvest links (plus an initial screenshot/cookies) from the start page.
        hrefs, cookies, screenshot_path = get_page_data(driver, start_url, output_dir)

        # Keep only same-domain links, capped at 3 to bound runtime.
        filtered_links = filter_links(hrefs, start_url)
        filtered_links = filtered_links[:3]

        results = []
        for link in filtered_links:
            logger.info(f"Processing {link}...")
            hrefs, cookies, screenshot_path = get_page_data(driver, link, output_dir)

            cookie_file_path = generate_cookie_file(cookies, link, output_dir)

            # Excel HYPERLINK formulas let the report open the per-page
            # artifacts directly; both paths are relative to output_dir.
            cookie_hyperlink = f'=HYPERLINK("{cookie_file_path}", "查看")' if cookie_file_path else 'No Cookie File'
            screenshot_hyperlink = f'=HYPERLINK("{screenshot_path}", "查看截图")' if screenshot_path else 'No Screenshot'

            results.append({
                'Page Path': link,
                'Cookie File': cookie_hyperlink,
                'Screenshot': screenshot_hyperlink
            })

        generate_main_excel(results, output_file)

    except Exception as e:
        logger.error(f"Error in main process: {e}")
    finally:
        # Bug fix: always release the browser process, even when the crawl
        # fails partway — previously an exception skipped driver.quit().
        if driver is not None:
            driver.quit()
|
|
|
|
# 设置入口参数
|
|
if __name__ == '__main__':
    # Page under test (CapCut landing page, tracking parameters left intact).
    start_url = 'https://www.capcut.com/tools/desktop-video-editor?utm_medium=sem&utm_source=googleadwords_int&pid=359289&af_c_id=21157337217&adset_id=162157605753&ad_id=697948663363&placement=&keyword_name=capcut&targetid=kwd-1406970026529&matchtype=e&gad_source=1&gclid=Cj0KCQiA4fi7BhC5ARIsAEV1Yib0W39PL05K1QxR6WhUe87uIu7P0JmCcRdVxIFSlCxwxCGtJw_0sZMaAiIMEALw_wcB'
    # Destination for the report; its parent directory is recreated by main().
    output_file = 'output/cookies_comparison_with_links.xlsx'
    main(start_url, output_file)
|