init

2025-04-04 16:48:59 +08:00 · 2025-04-04 16:48:59 +08:00 · 18b3ee16e3
commit 18b3ee16e3
4 changed files with 264 additions and 0 deletions
--- a/compare_colume.py
+++ b/compare_colume.py
@ -0,0 +1,53 @@
 import pandas as pd
 def _load_frame(source, sheet):
    """Return *source* as a header-less DataFrame.

    A DataFrame is passed through untouched; anything else is handed to
    ``pd.read_excel`` as the workbook to read (no header row, so columns
    and rows are addressed by 0-based position).
    """
    if isinstance(source, pd.DataFrame):
        return source
    return pd.read_excel(source, sheet_name=sheet, header=None)


 def compare_excel_columns(file_a, file_b, sheet_a=0, sheet_b=0,
                           column_a=5, column_b=5, start_row=3):
    """
    Check whether one column of workbook A equals one column of workbook B.

    Parameters:
    file_a: path of workbook A (an already-loaded DataFrame is also accepted)
    file_b: path of workbook B (an already-loaded DataFrame is also accepted)
    sheet_a: sheet index or name in workbook A (default: first sheet)
    sheet_b: sheet index or name in workbook B (default: first sheet)
    column_a: 0-based column position read from A (default 5 = spreadsheet column F)
    column_b: 0-based column position read from B (default 5 = spreadsheet column F)
    start_row: 0-based row position of the first compared cell (default 3 = sheet row 4)

    Returns:
    True when both columns have the same length and the same values,
    otherwise False. Two cells that are both empty count as equal.
    Any error (missing file, missing column, ...) is printed and
    reported as False.
    """
    try:
        df_a = _load_frame(file_a, sheet_a)
        df_b = _load_frame(file_b, sheet_b)
        # Positional selection; the index is reset so both series line up
        # from 0 regardless of where the data started.
        col_a = df_a.iloc[start_row:, column_a].reset_index(drop=True)
        col_b = df_b.iloc[start_row:, column_b].reset_index(drop=True)
        # A length mismatch makes an element-wise comparison meaningless.
        if len(col_a) != len(col_b):
            print(f"数据长度不一致: 表A有{len(col_a)}行，表B有{len(col_b)}行")
            return False
        # NaN never equals NaN, so a plain == would flag every pair of
        # empty cells as a difference; treat "both missing" as a match.
        comparison = (col_a == col_b) | (col_a.isna() & col_b.isna())
        if comparison.all():
            print("两列数据完全相同")
            return True
        # Report every mismatching row using its 1-based sheet row number.
        diff_indices = comparison[~comparison].index
        print(f"发现{len(diff_indices)}处不同:")
        for idx in diff_indices:
            print(f"行 {idx + start_row + 1}: 表A值='{col_a[idx]}', 表B值='{col_b[idx]}'")
        return False
    except Exception as e:
        # Best-effort tool: report the problem and answer "not identical".
        print(f"发生错误: {str(e)}")
        return False
 # Example usage
 if __name__ == "__main__":
    # Replace these two paths with your own workbook A and workbook B.
    path_a = "4_2/全国统计系统会议接送表 准1 - 副本(1).xls"
    path_b = "4_2/（4.2）全国统计系统办公室工作会议参会人员报名表 - 副本(1).xls"
    is_same = compare_excel_columns(path_a, path_b)
    # Print the verdict: "相同" (same) or "不同" (different).
    verdict = "相同" if is_same else "不同"
    print("对比结果:", verdict)
--- a/extract_time.py
+++ b/extract_time.py
@ -0,0 +1,32 @@
 import xlrd
 from openpyxl import Workbook
 import re
 # Input is a legacy .xls workbook; only its first worksheet is processed.
 input_file = "全国统计系统会议接送表.xls"
 source_sheet = xlrd.open_workbook(input_file).sheet_by_index(0)
 # Output goes to a brand-new .xlsx workbook, written to its active sheet.
 output_workbook = Workbook()
 target_sheet = output_workbook.active
 # Copy every cell value across (xlrd is 0-based, openpyxl is 1-based).
 # Only the values are written here; no styling is transferred.
 for src_row in range(source_sheet.nrows):
    for src_col in range(source_sheet.ncols):
        copied = source_sheet.cell_value(src_row, src_col)
        target_sheet.cell(row=src_row + 1, column=src_col + 1, value=copied)
 # For each data row (sheet row 4 onward) pull a clock time out of column F
 # (0-based index 5) and store it in column M (1-based column 13).
 for src_row in range(3, source_sheet.nrows):
    raw = source_sheet.cell_value(src_row, 5)
    if not raw:
        continue
    # Accepts both the ASCII colon and the full-width colon, e.g. 14:25 / 16：35.
    found = re.search(r'(\d{1,2}[:：]\d{2})', str(raw))
    if not found:
        continue
    # Normalise the separator to an ASCII colon before writing.
    normalised = found.group(1).replace('：', ':')
    target_sheet.cell(row=src_row + 1, column=13, value=normalised)
 # Save the result as a new .xlsx file.
 output_file = "全国统计系统会议接送表_更新.xlsx"
 output_workbook.save(output_file)
 print(f"处理完成，结果已保存至: {output_file}")
--- a/sort_time.py
+++ b/sort_time.py
@ -0,0 +1,61 @@
 import xlrd
 import xlwt
 import re
 from datetime import datetime
 # Read the source .xls workbook; only the first sheet is processed.
 input_file = "全国统计系统会议接送表.xls"
 workbook = xlrd.open_workbook(input_file)
 sheet = workbook.sheet_by_index(0)
 # Create the output .xls workbook.
 output_workbook = xlwt.Workbook()
 output_sheet = output_workbook.add_sheet("Sheet1")
 # Copy the three header rows unchanged.
 header_row_count = 3
 for header_row in range(header_row_count):
    for col in range(sheet.ncols):
        output_sheet.write(header_row, col, sheet.cell_value(header_row, col))
 # The extracted time is appended after the last source column of every data
 # row, so its title must go into that same column. Using a fixed column
 # would misalign title and data, and would write a header cell twice when
 # the source sheet is wider (xlwt rejects rewriting a cell by default).
 time_col = sheet.ncols
 output_sheet.write(header_row_count - 1, time_col, "到达时间")
 # Collect the data rows together with the time parsed from column F.
 data_rows = []
 for row in range(header_row_count, sheet.nrows):
    row_data = [sheet.cell_value(row, col) for col in range(sheet.ncols)]
    cell_value = sheet.cell_value(row, 5)  # column F (xlrd columns are 0-based)
    extracted_time = None
    time_obj = None
    if cell_value:
        # Accepts both the ASCII and the full-width colon (14:25 / 16：35).
        time_match = re.search(r'(\d{1,2}[:：]\d{2})', str(cell_value))
        if time_match:
            extracted_time = time_match.group(1).replace('：', ':')
            # Parse into a time object so rows sort chronologically; text
            # that looks like a time but is out of range (e.g. 25:99) keeps
            # its string but is treated as having no sortable time.
            try:
                time_obj = datetime.strptime(extracted_time, "%H:%M").time()
            except ValueError:
                time_obj = None
    row_data.append(extracted_time)  # extra trailing column with the time text
    data_rows.append((time_obj, row, row_data))  # time, original row index, values
 # Sort chronologically with rows lacking a time at the end: the first key
 # element is False for timed rows and True for untimed ones, and False
 # sorts before True. The sort is stable, so ties keep their source order.
 data_rows.sort(key=lambda entry: (entry[0] is None, entry[0] or datetime.min.time()))
 # Write the sorted rows directly below the header rows.
 for output_row, (_, _, row_data) in enumerate(data_rows, start=header_row_count):
    for col, value in enumerate(row_data):
        output_sheet.write(output_row, col, value)
 # Save the result.
 output_file = "全国统计系统会议接送表_排序.xls"
 output_workbook.save(output_file)
 print(f"处理完成，结果已保存至: {output_file}")
--- a/split_sheet.py
+++ b/split_sheet.py
@ -0,0 +1,118 @@
 import xlrd
 import xlwt
 import re
 from datetime import datetime
 def extract_time(cell_value):
    """Return the first clock time found in a cell, or None.

    The cell is converted to text and searched for one or two digits, a
    colon (ASCII or full-width), and two digits. The match is returned
    with an ASCII colon. Falsy cells and cells without a match give None.
    """
    if not cell_value:
        return None
    found = re.search(r'(\d{1,2}[:：]\d{2})', str(cell_value))
    if found is None:
        return None
    return found.group(1).replace('：', ':')
 def is_airport(cell_value):
    """Tell whether a cell describes an arrival by air.

    True when the cell's text contains any of the markers T1, T2, T3,
    '机场' (airport) or '航站楼' (terminal); False otherwise, including
    for empty or other falsy cells.
    """
    airport_markers = ('T1', 'T2', 'T3', '机场', '航站楼')
    # Falsy cells are searched as empty text, which matches no marker.
    text = str(cell_value) if cell_value else ''
    return any(marker in text for marker in airport_markers)
 def is_train_station(cell_value):
    """Tell whether a cell describes an arrival by train.

    True when the cell's text contains '站' (station), '高铁' (high-speed
    rail) or the letter 'G'; False otherwise, including for empty or other
    falsy cells. Station names such as '北站' or '贵阳站' are covered by
    the '站' marker.

    The word '航站楼' (airport terminal) is ignored during the search: it
    contains '站' and would otherwise make every terminal look like a
    railway station.
    """
    if not cell_value:
        return False
    text = str(cell_value).replace('航站楼', '')
    # NOTE(review): 'G' is a very broad marker (any capital G matches);
    # kept as-is because callers may rely on it for train numbers.
    return any(marker in text for marker in ('站', '高铁', 'G'))
 # Load the source workbook; only its first sheet is used.
 input_file = "4_2/全国统计系统会议接送表(1)(1).xls"
 workbook = xlrd.open_workbook(input_file)
 sheet = workbook.sheet_by_index(0)
 # One output workbook per arrival mode, each holding a single sheet.
 airplane_workbook = xlwt.Workbook()
 airplane_sheet = airplane_workbook.add_sheet("飞机到达表")
 train_workbook = xlwt.Workbook()
 train_sheet = train_workbook.add_sheet("火车到达表")
 # Buckets of (time_obj, row_data) pairs, filled by the loop below.
 airplane_data = []
 train_data = []
 # The source sheet starts with three header rows.
 header_row_count = 3
 # Flat, row-major list of every header cell value (rows 0-2).
 # NOTE(review): nothing in this section reads it again.
 headers = []
 for header_row in range(3):
    for header_col in range(sheet.ncols):
        headers.append(sheet.cell_value(header_row, header_col))
 # Mirror the header rows into both output sheets.
 for header_row in range(header_row_count):
    for header_col in range(sheet.ncols):
        header_value = sheet.cell_value(header_row, header_col)
        airplane_sheet.write(header_row, header_col, header_value)
        train_sheet.write(header_row, header_col, header_value)
 # Walk the data rows, extract the time from column F and pick a bucket.
 for row_index in range(header_row_count, sheet.nrows):
    values = [sheet.cell_value(row_index, c) for c in range(sheet.ncols)]
    arrival_cell = sheet.cell_value(row_index, 5)  # column F
    time_text = extract_time(arrival_cell)
    # Parsed time is used for ordering; it stays None when there is no
    # time text or the text is not a valid HH:MM value.
    try:
        parsed = datetime.strptime(time_text, "%H:%M").time() if time_text else None
    except ValueError:
        parsed = None
    # The time text becomes an extra trailing column of the row.
    values.append(time_text)
    if is_airport(arrival_cell):
        bucket = airplane_data
    elif is_train_station(arrival_cell):
        bucket = train_data
    else:
        # Unrecognised arrival mode: filed under the train sheet by default.
        bucket = train_data
    bucket.append((parsed, values))
 # Sort key
 def sort_key(item):
    """Return the ordering key for a ``(time_obj, row_data)`` pair.

    Rows are ordered chronologically by ``time_obj``; rows whose time is
    None are placed after every timed row. The first key element is False
    for timed rows and True for untimed ones (False sorts first); the
    second is the time itself, with midnight standing in for None so the
    tuples stay comparable.
    """
    time_obj, _row_data = item
    return (time_obj is None, time_obj or datetime.min.time())
 # Order each bucket in place with sort_key.
 for bucket in (airplane_data, train_data):
    bucket.sort(key=sort_key)
 # Write each bucket to its sheet, starting on the row right after the
 # header rows; every value of a row goes into consecutive columns.
 for target_sheet, bucket in ((airplane_sheet, airplane_data), (train_sheet, train_data)):
    for output_row, (_, row_data) in enumerate(bucket, start=header_row_count):
        for col, value in enumerate(row_data):
            target_sheet.write(output_row, col, value)
 # Save both workbooks, then report where they were written.
 airplane_file = "飞机到达表.xls"
 train_file = "火车到达表.xls"
 airplane_workbook.save(airplane_file)
 train_workbook.save(train_file)
 print(f"处理完成，飞机到达表已保存至: {airplane_file}")
 print(f"处理完成，火车到达表已保存至: {train_file}")