#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import html
import re

# Path of the saved HTML page to scan, and of the CSV file to generate.
html_file_path = '/Users/v_sat/Desktop/中国和34个省级2023年高清行政地图 - 知乎.html'
output_csv_path = '/Users/v_sat/Documents/trae_projects/bingqi/data/map_data.csv'

# Names searched for in captions / page text (province-level regions plus '中国').
provinces = [
    '北京', '天津', '河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江',
    '上海', '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北',
    '湖南', '广东', '广西', '海南', '重庆', '四川', '贵州', '云南', '西藏',
    '陕西', '甘肃', '青海', '宁夏', '新疆', '香港', '澳门', '台湾', '中国'
]

# One <figure>...</figure> element; group 1 is its inner markup.
_FIGURE_RE = re.compile(r'<figure\b[^>]*>(.*?)</figure>', re.DOTALL)

# Inside a single figure: an <img> followed by a <figcaption>.
# Because `[^>]*` is greedy, when an <img> tag carries several attributes
# ending in `src=` (e.g. `src` and `data-...src`), the last one is captured.
_IMG_CAPTION_RE = re.compile(
    r'<img[^>]*src=["\'](.*?)["\'].*?<figcaption[^>]*>(.*?)</figcaption>',
    re.DOTALL,
)


def extract_captioned_images(html_content: str) -> dict:
    """Return a ``{caption: image_url}`` mapping for every captioned figure.

    Each ``<figure>`` element is examined on its own, so a figure without a
    ``<figcaption>`` is skipped instead of being paired with the caption of a
    later figure.  HTML entities in both the caption and the URL are decoded.
    If two figures share the same caption, the later image wins.
    """
    captioned = {}
    for figure in _FIGURE_RE.finditer(html_content):
        inner = _IMG_CAPTION_RE.search(figure.group(1))
        if inner is None:
            continue
        img_url = html.unescape(inner.group(1))
        caption = html.unescape(inner.group(2))
        captioned[caption] = img_url
    return captioned


def find_province_images(html_content: str, province_names) -> list:
    """Find one image URL per province name in *html_content*.

    A province is first matched against figure captions (the first caption
    containing the name is used).  Provinces still missing afterwards fall
    back to the first ``<img>`` inside the first ``<figure>`` that follows
    the first mention of the name anywhere in the page.

    Returns a list of ``{'province': ..., 'image_url': ...}`` dicts, one per
    province found, sorted by province name using plain string comparison
    (code-point order).  Provinces with no match are omitted.
    """
    captioned = extract_captioned_images(html_content)
    found = {}

    # Primary strategy: the province name appears in a figure caption.
    for province in province_names:
        for caption, img_url in captioned.items():
            if province in caption:
                found[province] = img_url
                break

    # Fallback strategy: first figure image after a mention of the province.
    for province in province_names:
        if province in found:
            continue
        fallback_pattern = (
            re.escape(province)
            + r'(?:省|市|自治区|特别行政区)?.*?<figure[^>]*>.*?<img[^>]*src=["\'](.*?)["\']'
        )
        hit = re.search(fallback_pattern, html_content, re.DOTALL)
        if hit is not None:
            found[province] = html.unescape(hit.group(1))

    # Keying by province already guarantees one row per province.
    return [
        {'province': province, 'image_url': found[province]}
        for province in sorted(found)
    ]


def write_map_csv(rows, csv_path: str) -> None:
    """Write *rows* (as returned by ``find_province_images``) to *csv_path*.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and has the
    header columns ``省`` and ``地图图片地址``.
    """
    with open(csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
        fieldnames = ['省', '地图图片地址']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for item in rows:
            writer.writerow({'省': item['province'], '地图图片地址': item['image_url']})


def main() -> None:
    """Read the HTML page, extract the province map images and write the CSV."""
    with open(html_file_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    rows = find_province_images(html_content, provinces)
    write_map_csv(rows, output_csv_path)

    print(f"已成功生成地图数据CSV文件: {output_csv_path}")
    print(f"共提取到 {len(rows)} 条地图数据")


if __name__ == '__main__':
    main()
