From 18b3ee16e3ed0ca7446cc31e5c846aac9ccc6d07 Mon Sep 17 00:00:00 2001
From: Qihang Zhang <zqh6200@qq.com>
Date: Fri, 4 Apr 2025 16:48:59 +0800
Subject: [PATCH] init

---
 compare_colume.py |  53 +++++++++++++++++++++
 extract_time.py   |  32 +++++++++++++
 sort_time.py      |  61 ++++++++++++++++++++++++
 split_sheet.py    | 118 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 264 insertions(+)
 create mode 100644 compare_colume.py
 create mode 100644 extract_time.py
 create mode 100644 sort_time.py
 create mode 100644 split_sheet.py

diff --git a/compare_colume.py b/compare_colume.py
new file mode 100644
index 0000000..fc8045e
--- /dev/null
+++ b/compare_colume.py
@@ -0,0 +1,53 @@
+import pandas as pd
+
+
+def compare_excel_columns(file_a, file_b, sheet_a=0, sheet_b=0):
+    """Compare column F (from row 4 down) of two Excel sheets cell by cell and print the result.
+
+    Args:
+        file_a: path of workbook A.
+        file_b: path of workbook B.
+        sheet_a: sheet index or name in workbook A (defaults to the first sheet).
+        sheet_b: sheet index or name in workbook B (defaults to the first sheet).
+    Returns: True if both columns match (two blank cells count as equal); False on any difference or error.
+    """
+    try:
+        # Read both sheets without a header row so positions map 1:1 onto Excel cells
+        df_a = pd.read_excel(file_a, sheet_name=sheet_a, header=None)
+        df_b = pd.read_excel(file_b, sheet_name=sheet_b, header=None)
+
+        # Column F from F4 downwards: F is the 6th column (index 5), row 4 is index 3
+        col_a = df_a.iloc[3:, 5].reset_index(drop=True)
+        # Same column F (index 5) in workbook B. NOTE(review): the earlier comment mentioned column G — confirm which is intended
+        col_b = df_b.iloc[3:, 5].reset_index(drop=True)
+
+        # Compare lengths first
+        if len(col_a) != len(col_b):
+            print(f"数据长度不一致: 表A有{len(col_a)}行，表B有{len(col_b)}行")
+            return False
+
+        # Compare contents; NaN == NaN is False, so cells that are blank in both sheets are matched explicitly
+        comparison = (col_a == col_b) | (col_a.isna() & col_b.isna())
+        if comparison.all():
+            print("两列数据完全相同")
+            return True
+        else:
+            # List every differing row using its 1-based Excel row number (index 0 is row 4)
+            diff_indices = comparison[~comparison].index
+            print(f"发现{len(diff_indices)}处不同:")
+            for idx in diff_indices:
+                print(f"行 {idx + 4}: 表A值='{col_a[idx]}', 表B值='{col_b[idx]}'")
+            return False
+
+    except Exception as e:
+        print(f"发生错误: {str(e)}")
+        return False
+
+
+# Usage example
+if __name__ == "__main__":
+    file_a = "4_2/全国统计系统会议接送表 准1 - 副本(1).xls"  # replace with the path to workbook A
+    file_b = "4_2/（4.2）全国统计系统办公室工作会议参会人员报名表 - 副本(1).xls"  # replace with the path to workbook B
+
+    result = compare_excel_columns(file_a, file_b)
+    print("对比结果:", "相同" if result else "不同")
\ No newline at end of file
diff --git a/extract_time.py b/extract_time.py
new file mode 100644
index 0000000..2a082b2
--- /dev/null
+++ b/extract_time.py
@@ -0,0 +1,32 @@
+import xlrd
+from openpyxl import Workbook
+import re
+
+# Read the source .xls workbook
+input_file = "全国统计系统会议接送表.xls"
+workbook = xlrd.open_workbook(input_file)
+sheet = workbook.sheet_by_index(0)  # first worksheet
+
+# Create a new .xlsx workbook to write into
+output_workbook = Workbook()
+output_sheet = output_workbook.active
+
+# Copy every cell value into the new file (only values are copied, not cell formatting)
+for row in range(sheet.nrows):
+    for col in range(sheet.ncols):
+        output_sheet.cell(row=row + 1, column=col + 1, value=sheet.cell_value(row, col))
+
+# Extract the time from column F and write it to column M
+for row in range(3, sheet.nrows):  # start at sheet row 4 (xlrd row indices are 0-based)
+    cell_value = sheet.cell_value(row, 5)  # column F is the 6th column (xlrd column indices are 0-based)
+    if cell_value:
+        # Take the first H:MM / HH:MM time; an ASCII or full-width colon is accepted (e.g. 14:25, 16：35)
+        time_match = re.search(r'(\d{1,2}[:：]\d{2})', str(cell_value))
+        if time_match:
+            extracted_time = time_match.group(1).replace('：', ':')  # normalise to an ASCII colon
+            output_sheet.cell(row=row + 1, column=13, value=extracted_time)  # column M is the 13th column (openpyxl is 1-based)
+
+# Save as a new .xlsx file
+output_file = "全国统计系统会议接送表_更新.xlsx"
+output_workbook.save(output_file)
+print(f"处理完成，结果已保存至: {output_file}")
\ No newline at end of file
diff --git a/sort_time.py b/sort_time.py
new file mode 100644
index 0000000..02f0ea6
--- /dev/null
+++ b/sort_time.py
@@ -0,0 +1,61 @@
+import xlrd
+import xlwt
+import re
+from datetime import datetime
+
+# Sort the data rows of the sheet by the time found in column F; start by reading the source .xls
+input_file = "全国统计系统会议接送表.xls"
+workbook = xlrd.open_workbook(input_file)
+sheet = workbook.sheet_by_index(0)
+
+# Create the output .xls workbook
+output_workbook = xlwt.Workbook()
+output_sheet = output_workbook.add_sheet("Sheet1")
+
+# Copy the three header rows unchanged
+for col in range(sheet.ncols):
+    output_sheet.write(0, col, sheet.cell_value(0, col))
+    output_sheet.write(1, col, sheet.cell_value(1, col))
+    output_sheet.write(2, col, sheet.cell_value(2, col))
+
+# Add the new column title in column M. NOTE(review): assumes the source has 12 columns so index 12 is free — confirm
+output_sheet.write(2, 12, "到达时间")
+
+# Collect the data rows and extract each row's time
+data_rows = []
+for row in range(3, sheet.nrows):
+    row_data = [sheet.cell_value(row, col) for col in range(sheet.ncols)]
+    cell_value = sheet.cell_value(row, 5)  # column F is the 6th column (xlrd column indices are 0-based)
+    extracted_time = None
+
+    if cell_value:
+        time_match = re.search(r'(\d{1,2}[:：]\d{2})', str(cell_value))
+        if time_match:
+            extracted_time = time_match.group(1).replace('：', ':')
+            # Convert to a time object so rows can be ordered; out-of-range values such as 25:61 get no sort time
+            try:
+                time_obj = datetime.strptime(extracted_time, "%H:%M").time()
+            except ValueError:
+                time_obj = None
+        else:
+            time_obj = None
+    else:
+        time_obj = None
+
+    row_data.append(extracted_time)  # append the extracted time string as an extra trailing cell
+    data_rows.append((time_obj, row, row_data))  # keep the time object, the original row index and the row data
+
+# Sort by time ascending; rows without a time go last (False sorts before True, so the flag must be "is None")
+data_rows.sort(key=lambda x: (x[0] is None, x[0] or datetime.min.time()))
+
+# Write the sorted rows below the header rows
+output_row = 3
+for time_obj, original_row, row_data in data_rows:
+    for col in range(len(row_data)):
+        output_sheet.write(output_row, col, row_data[col])
+    output_row += 1
+
+# Save
+output_file = "全国统计系统会议接送表_排序.xls"
+output_workbook.save(output_file)
+print(f"处理完成，结果已保存至: {output_file}")
\ No newline at end of file
diff --git a/split_sheet.py b/split_sheet.py
new file mode 100644
index 0000000..a712a55
--- /dev/null
+++ b/split_sheet.py
@@ -0,0 +1,118 @@
+import xlrd
+import xlwt
+import re
+from datetime import datetime
+
+
+def extract_time(cell_value):
+    """Return the first H:MM / HH:MM time found in the cell text (ASCII colon), or None if there is none."""
+    if not cell_value:
+        return None
+    found = re.search(r'(\d{1,2}[:：]\d{2})', str(cell_value))
+    # A full-width colon is normalised so every result uses the ASCII ':'
+    return found.group(1).replace('：', ':') if found else None
+
+
+def is_airport(cell_value):
+    """Return True when the cell text contains T1/T2/T3 or the Chinese words for airport/terminal."""
+    markers = ('T1', 'T2', 'T3', '机场', '航站楼')
+    text = str(cell_value) if cell_value else ''
+    return any(marker in text for marker in markers)
+
+
+def is_train_station(cell_value):
+    """Return True when the cell text contains a station / high-speed-rail keyword or a capital 'G'."""
+    markers = ('站', '高铁', 'G', '北站', '贵阳站')
+    text = str(cell_value) if cell_value else ''
+    return any(marker in text for marker in markers)
+
+
+# Read the source workbook (first worksheet)
+input_file = "4_2/全国统计系统会议接送表(1)(1).xls"
+workbook = xlrd.open_workbook(input_file)
+sheet = workbook.sheet_by_index(0)
+
+# Create the two output workbooks
+airplane_workbook = xlwt.Workbook()
+train_workbook = xlwt.Workbook()
+
+# Create one worksheet in each workbook
+airplane_sheet = airplane_workbook.add_sheet("飞机到达表")
+train_sheet = train_workbook.add_sheet("火车到达表")
+
+# Buffers for the classified rows; each entry is (time_obj, row_data)
+airplane_data = []
+train_data = []
+
+# Flatten the header cells of the first 3 rows. NOTE(review): `headers` is not referenced again in this file
+headers = [sheet.cell_value(row, col) for row in range(3) for col in range(sheet.ncols)]
+header_row_count = 3  # the source sheet has 3 header rows
+
+# Copy the header rows into both worksheets
+for row in range(header_row_count):
+    for col in range(sheet.ncols):
+        airplane_sheet.write(row, col, sheet.cell_value(row, col))
+        train_sheet.write(row, col, sheet.cell_value(row, col))
+
+
+# Process the data rows
+for row in range(header_row_count, sheet.nrows):
+    row_data = [sheet.cell_value(row, col) for col in range(sheet.ncols)]
+    cell_value = sheet.cell_value(row, 5)  # column F
+    extracted_time = extract_time(cell_value)
+
+    # Convert to a time object for sorting; stays None when no valid time was parsed
+    time_obj = None
+    if extracted_time:
+        try:
+            time_obj = datetime.strptime(extracted_time, "%H:%M").time()
+        except ValueError:
+            pass
+
+    # Append the extracted time as an extra trailing cell (no header cell is written for it)
+    row_data.append(extracted_time)
+
+    # Classify by arrival type; the airport check takes precedence over the train check
+    if is_airport(cell_value):
+        airplane_data.append((time_obj, row_data))
+    elif is_train_station(cell_value):
+        train_data.append((time_obj, row_data))
+    else:
+        # Unrecognised arrival type: which sheet it belongs to depends on the user's needs
+        # By default it goes into the train-arrival sheet
+        train_data.append((time_obj, row_data))
+
+
+# Sort-key helper
+def sort_key(item):
+    """Key for (time_obj, row_data): ascending by time, rows with no parsed time last (False sorts before True)."""
+    return (item[0] is None, item[0] or datetime.min.time())
+
+
+# Sort each of the two data sets with sort_key
+airplane_data.sort(key=sort_key)
+train_data.sort(key=sort_key)
+
+# Write the airplane-arrival sheet, starting right below the header rows
+output_row = header_row_count
+for time_obj, row_data in airplane_data:
+    for col in range(len(row_data)):
+        airplane_sheet.write(output_row, col, row_data[col])
+    output_row += 1
+
+# Write the train-arrival sheet, starting right below the header rows
+output_row = header_row_count
+for time_obj, row_data in train_data:
+    for col in range(len(row_data)):
+        train_sheet.write(output_row, col, row_data[col])
+    output_row += 1
+
+# Save both files
+airplane_file = "飞机到达表.xls"
+train_file = "火车到达表.xls"
+
+airplane_workbook.save(airplane_file)
+train_workbook.save(train_file)
+
+print(f"处理完成，飞机到达表已保存至: {airplane_file}")
+print(f"处理完成，火车到达表已保存至: {train_file}")
\ No newline at end of file