def compare_excel_columns(file_a, file_b, sheet_a=0, sheet_b=0,
                          col_index_a=5, col_index_b=5, start_row=3):
    """Check whether one column of two Excel sheets holds identical data.

    Both sheets are read without a header row. The selected column of each
    sheet is taken from ``start_row`` downwards and the two columns are
    compared cell by cell. A row whose cell is empty (NaN) in both sheets
    counts as equal.

    Args:
        file_a: Path of workbook A.
        file_b: Path of workbook B.
        sheet_a: Sheet index or name in workbook A (default: first sheet).
        sheet_b: Sheet index or name in workbook B (default: first sheet).
        col_index_a: 0-based column index read from sheet A
            (default 5, i.e. Excel column F).
        col_index_b: 0-based column index read from sheet B
            (default 5, i.e. Excel column F).
        start_row: 0-based index of the first data row
            (default 3, i.e. Excel row 4).

    Returns:
        True when both columns have the same length and every row matches.
        False when the lengths differ, when any row differs, or when reading
        or comparing raises; in each of those cases a message is printed.
    """
    try:
        df_a = pd.read_excel(file_a, sheet_name=sheet_a, header=None)
        df_b = pd.read_excel(file_b, sheet_name=sheet_b, header=None)

        # Re-index from 0 so both columns line up row by row.
        col_a = df_a.iloc[start_row:, col_index_a].reset_index(drop=True)
        col_b = df_b.iloc[start_row:, col_index_b].reset_index(drop=True)

        if len(col_a) != len(col_b):
            print(f"数据长度不一致: 表A有{len(col_a)}行,表B有{len(col_b)}行")
            return False

        # NaN never compares equal to NaN, so a cell that is blank in both
        # sheets has to be accepted as a match explicitly.
        comparison = (col_a == col_b) | (col_a.isna() & col_b.isna())
        if comparison.all():
            print("两列数据完全相同")
            return True

        diff_indices = comparison[~comparison].index
        print(f"发现{len(diff_indices)}处不同:")
        for idx in diff_indices:
            # Convert the 0-based offset back to a 1-based Excel row number.
            print(f"行 {idx + start_row + 1}: 表A值='{col_a[idx]}', 表B值='{col_b[idx]}'")
        return False

    except Exception as e:
        # Deliberate catch-all: the function's contract is "print the problem
        # and return False" for unreadable files, missing sheets or columns.
        print(f"发生错误: {str(e)}")
        return False
import re

import xlrd
from openpyxl import Workbook

# H:MM or HH:MM written with either an ASCII colon or a full-width colon
# (U+FF1A). Compiled once because it is applied to every data row.
TIME_PATTERN = re.compile(r'(\d{1,2}[:\uff1a]\d{2})')

# Read the first worksheet of the source .xls workbook.
input_file = "全国统计系统会议接送表.xls"
workbook = xlrd.open_workbook(input_file)
sheet = workbook.sheet_by_index(0)

# Create the .xlsx workbook that receives the result.
output_workbook = Workbook()
output_sheet = output_workbook.active

# Copy every cell value across. Only values are copied, not cell styles.
# openpyxl rows/columns are 1-based while xlrd indices are 0-based.
for row in range(sheet.nrows):
    for col in range(sheet.ncols):
        output_sheet.cell(row=row + 1, column=col + 1, value=sheet.cell_value(row, col))

# Pull the time out of column F (xlrd index 5) for each row from the 4th
# sheet row on, and write it to column M (openpyxl column 13). If the source
# sheet already has 13 or more columns, this overwrites the copied value.
for row in range(3, sheet.nrows):
    cell_value = sheet.cell_value(row, 5)
    if not cell_value:
        continue
    time_match = TIME_PATTERN.search(str(cell_value))
    if time_match:
        # Normalise a full-width colon to ASCII so all times look alike.
        extracted_time = time_match.group(1).replace('\uff1a', ':')
        output_sheet.cell(row=row + 1, column=13, value=extracted_time)

# Save the result as a new .xlsx file.
output_file = "全国统计系统会议接送表_更新.xlsx"
output_workbook.save(output_file)
print(f"处理完成,结果已保存至: {output_file}")
# H:MM or HH:MM written with either an ASCII colon or a full-width colon
# (U+FF1A), which is common in Chinese-language spreadsheets.
_TIME_RE = re.compile(r'(\d{1,2}[:\uff1a]\d{2})')


def extract_time(cell_value):
    """Return the first H:MM / HH:MM time found in a cell, or None.

    The cell value is converted with ``str`` before searching. A full-width
    colon in the match is normalised to an ASCII colon, so the result can be
    parsed with ``"%H:%M"``.

    Args:
        cell_value: Raw cell content; falsy values (empty string, None, 0)
            yield None.

    Returns:
        The matched time as a string such as ``"14:25"``, or None when the
        cell is empty or contains no time-like text.
    """
    if not cell_value:
        return None
    time_match = _TIME_RE.search(str(cell_value))
    if time_match is None:
        return None
    return time_match.group(1).replace('\uff1a', ':')


def is_airport(cell_value):
    """Return True when the cell text contains an airport keyword.

    The keywords are T1, T2, T3, 机场 and 航站楼; matching is a plain
    substring test on ``str(cell_value)``. Falsy values return False.
    """
    if not cell_value:
        return False
    return any(keyword in str(cell_value) for keyword in ['T1', 'T2', 'T3', '机场', '航站楼'])


def is_train_station(cell_value):
    """Return True when the cell text contains a railway keyword.

    Matching is a plain substring test on ``str(cell_value)``. Note that the
    bare keyword 'G' matches any text containing a capital G, and that '北站'
    and '贵阳站' are already covered by '站'. Falsy values return False.
    """
    if not cell_value:
        return False
    return any(keyword in str(cell_value) for keyword in ['站', '高铁', 'G', '北站', '贵阳站'])
def sort_key(item):
    """Build the sort key for a ``(time_obj, row_data)`` pair.

    Rows are ordered by ascending arrival time, and rows without a time
    (``time_obj is None``) are placed after all timed rows.

    Args:
        item: A 2-tuple ``(time_obj, row_data)`` where ``time_obj`` is a
            ``datetime.time`` or None.

    Returns:
        A tuple ``(has_no_time, time_value)`` suitable for ``list.sort``.
    """
    time_obj, _row_data = item
    if time_obj is None:
        # True sorts after False, which pushes untimed rows to the end. The
        # placeholder time only keeps the tuple comparable.
        return (True, datetime.min.time())
    return (False, time_obj)
+print(f"处理完成,飞机到达表已保存至: {airplane_file}") +print(f"处理完成,火车到达表已保存至: {train_file}") \ No newline at end of file