This commit is contained in:
Qihang Zhang 2025-04-04 16:48:59 +08:00
commit 18b3ee16e3
4 changed files with 264 additions and 0 deletions

53
compare_colume.py Normal file
View File

@ -0,0 +1,53 @@
import pandas as pd
def compare_excel_columns(file_a, file_b, sheet_a=0, sheet_b=0,
                          column_a=5, column_b=5, start_row=3):
    """Check whether one column of two Excel sheets holds identical data.

    Both sheets are read without a header row, the selected column is taken
    from ``start_row`` downwards, and the two columns are compared cell by
    cell. Differences are printed together with their 1-based sheet row.

    Args:
        file_a: Path of workbook A.
        file_b: Path of workbook B.
        sheet_a: Sheet index or name in workbook A (default: first sheet).
        sheet_b: Sheet index or name in workbook B (default: first sheet).
        column_a: 0-based column index read from sheet A (5 is column F).
        column_b: 0-based column index read from sheet B (5 is column F).
        start_row: 0-based index of the first row compared (3 is row 4).

    Returns:
        True when both columns have the same length and equal content,
        False otherwise, including when reading either file fails.
    """
    # NOTE(review): the original inline comments mentioned column G for
    # sheet B, but the code read index 5 (column F) from both sheets. The
    # defaults keep what the code did; pass column_b=6 if G was intended.
    try:
        df_a = pd.read_excel(file_a, sheet_name=sheet_a, header=None)
        df_b = pd.read_excel(file_b, sheet_name=sheet_b, header=None)

        col_a = df_a.iloc[start_row:, column_a].reset_index(drop=True)
        col_b = df_b.iloc[start_row:, column_b].reset_index(drop=True)

        if len(col_a) != len(col_b):
            print(f"数据长度不一致: 表A有{len(col_a)}表B有{len(col_b)}")
            return False

        # NaN never equals NaN, so cells that are empty in both sheets must
        # be treated as equal explicitly or they show up as differences.
        both_empty = col_a.isna() & col_b.isna()
        comparison = (col_a == col_b) | both_empty

        if comparison.all():
            print("两列数据完全相同")
            return True

        diff_indices = comparison[~comparison].index
        print(f"发现{len(diff_indices)}处不同:")
        for idx in diff_indices:
            # Convert the 0-based offset back to the 1-based sheet row.
            sheet_row = idx + start_row + 1
            print(f"{sheet_row}: 表A值='{col_a[idx]}', 表B值='{col_b[idx]}'")
        return False
    except Exception as e:
        # Script-level boundary: report the problem and signal "not equal".
        print(f"发生错误: {str(e)}")
        return False
# Usage example: compare the default column of two workbooks.
if __name__ == "__main__":
    # Replace with the path of workbook A.
    file_a = "4_2/全国统计系统会议接送表 准1 - 副本(1).xls"
    # Replace with the path of workbook B.
    file_b = "4_2/4.2)全国统计系统办公室工作会议参会人员报名表 - 副本(1).xls"
    result = compare_excel_columns(file_a, file_b)
    if result:
        print("对比结果:", "相同")
    else:
        print("对比结果:", "不同")

32
extract_time.py Normal file
View File

@ -0,0 +1,32 @@
import xlrd
from openpyxl import Workbook
import re
# Read the source .xls workbook and take its first worksheet.
input_file = "全国统计系统会议接送表.xls"
workbook = xlrd.open_workbook(input_file)
sheet = workbook.sheet_by_index(0)

# Create a new .xlsx workbook to receive the data.
output_workbook = Workbook()
output_sheet = output_workbook.active

# Copy every cell value into the new workbook (values only; openpyxl rows
# and columns are 1-based while xlrd indices are 0-based).
for row in range(sheet.nrows):
    for col in range(sheet.ncols):
        output_sheet.cell(row=row + 1, column=col + 1, value=sheet.cell_value(row, col))

# Matches times such as "14:25" written with either an ASCII or a
# full-width colon.
TIME_PATTERN = re.compile(r'(\d{1,2}[::]\d{2})')

# Extract the time from column F and write it to column M.
for row in range(3, sheet.nrows):  # data starts at sheet row 4 (index 3)
    cell_value = sheet.cell_value(row, 5)  # column F is index 5 in xlrd
    if cell_value:
        time_match = TIME_PATTERN.search(str(cell_value))
        if time_match:
            # Normalise a full-width colon to ASCII. Replacing the empty
            # string instead would insert ':' between every character.
            extracted_time = time_match.group(1).replace(':', ':')
            output_sheet.cell(row=row + 1, column=13, value=extracted_time)  # column M

# Save the result as a new .xlsx file.
output_file = "全国统计系统会议接送表_更新.xlsx"
output_workbook.save(output_file)
print(f"处理完成,结果已保存至: {output_file}")

61
sort_time.py Normal file
View File

@ -0,0 +1,61 @@
import xlrd
import xlwt
import re
from datetime import datetime
# Read the source .xls workbook and take its first worksheet.
input_file = "全国统计系统会议接送表.xls"
workbook = xlrd.open_workbook(input_file)
sheet = workbook.sheet_by_index(0)

# Create the output .xls workbook.
output_workbook = xlwt.Workbook()
output_sheet = output_workbook.add_sheet("Sheet1")

# The first three rows are headers; everything below is data.
HEADER_ROW_COUNT = 3
# The extracted time goes to column M (index 12), or to the first free
# column when the source sheet is wider than that. xlwt refuses to write the
# same cell twice, and the title must sit in the column that gets the values.
TIME_COLUMN = max(12, sheet.ncols)
# Matches times such as "14:25" written with either an ASCII or a
# full-width colon.
TIME_PATTERN = re.compile(r'(\d{1,2}[::]\d{2})')

# Copy the header rows (fewer if the sheet is shorter than three rows).
for row in range(min(HEADER_ROW_COUNT, sheet.nrows)):
    for col in range(sheet.ncols):
        output_sheet.write(row, col, sheet.cell_value(row, col))

# Title of the new "arrival time" column, placed on the last header row.
output_sheet.write(2, TIME_COLUMN, "到达时间")

# Collect the data rows together with the time found in column F.
data_rows = []
for row in range(HEADER_ROW_COUNT, sheet.nrows):
    row_data = [sheet.cell_value(row, col) for col in range(sheet.ncols)]
    cell_value = sheet.cell_value(row, 5)  # column F is index 5 in xlrd
    extracted_time = None
    time_obj = None
    if cell_value:
        time_match = TIME_PATTERN.search(str(cell_value))
        if time_match:
            # Normalise a full-width colon to ASCII. Replacing the empty
            # string instead would insert ':' between every character and
            # make every strptime call below fail.
            extracted_time = time_match.group(1).replace(':', ':')
            try:
                # A time object sorts chronologically, unlike "9:05" text.
                time_obj = datetime.strptime(extracted_time, "%H:%M").time()
            except ValueError:
                # Looks like a time but is not one (e.g. "25:99").
                time_obj = None
    # Keep the time object, the original row index and the row content.
    data_rows.append((time_obj, row, row_data, extracted_time))

# Sort by time; rows without a usable time go last. False sorts before True,
# so the flag has to be "time is missing". The sort is stable, so rows with
# equal keys keep their original order.
data_rows.sort(key=lambda item: (item[0] is None, item[0] or datetime.min.time()))

# Write the sorted rows below the header.
output_row = HEADER_ROW_COUNT
for time_obj, original_row, row_data, extracted_time in data_rows:
    for col, value in enumerate(row_data):
        output_sheet.write(output_row, col, value)
    output_sheet.write(output_row, TIME_COLUMN, extracted_time)
    output_row += 1

# Save the result.
output_file = "全国统计系统会议接送表_排序.xls"
output_workbook.save(output_file)
print(f"处理完成,结果已保存至: {output_file}")

118
split_sheet.py Normal file
View File

@ -0,0 +1,118 @@
import xlrd
import xlwt
import re
from datetime import datetime
def extract_time(cell_value):
    """Extract the first ``H:MM`` / ``HH:MM`` time found in a cell value.

    Both the ASCII colon and the full-width colon are accepted as the
    separator; the returned text always uses the ASCII colon.

    Args:
        cell_value: Raw cell content; it is converted with ``str``.

    Returns:
        The time text such as ``"14:25"``, or None when the cell is empty
        or contains no time.
    """
    if cell_value:
        time_match = re.search(r'(\d{1,2}[::]\d{2})', str(cell_value))
        if time_match:
            # Normalise a full-width colon to ASCII. Replacing the empty
            # string instead would insert ':' between every character.
            return time_match.group(1).replace(':', ':')
    return None
def is_airport(cell_value):
    """Tell whether a cell describes an arrival by plane.

    The cell counts as an airport arrival when its text contains one of the
    keywords T1, T2, T3, 机场 or 航站楼. Empty or missing values never match.
    """
    airport_keywords = ('T1', 'T2', 'T3', '机场', '航站楼')
    if cell_value:
        text = str(cell_value)
        for keyword in airport_keywords:
            if keyword in text:
                return True
    return False
def is_train_station(cell_value):
    """Tell whether a cell describes an arrival by train.

    The cell counts as a train arrival when its text contains one of the
    keywords 站, 高铁, G, 北站 or 贵阳站. Empty or missing values never match.
    """
    if not cell_value:
        return False
    # The first keyword must be the non-empty '站': an empty string is a
    # substring of every text and would classify every cell as a station.
    return any(keyword in str(cell_value) for keyword in ['站', '高铁', 'G', '北站', '贵阳站'])
# Open the source workbook and take its first worksheet.
input_file = "4_2/全国统计系统会议接送表(1)(1).xls"
workbook = xlrd.open_workbook(input_file)
sheet = workbook.sheet_by_index(0)

# One output workbook, holding a single sheet, per arrival type.
airplane_workbook = xlwt.Workbook()
airplane_sheet = airplane_workbook.add_sheet("飞机到达表")
train_workbook = xlwt.Workbook()
train_sheet = train_workbook.add_sheet("火车到达表")

# Each entry collected below is a (time object or None, row values) pair.
airplane_data = []
train_data = []

# The source sheet starts with three header rows.
header_row_count = 3
headers = [
    sheet.cell_value(header_row, header_col)
    for header_row in range(3)
    for header_col in range(sheet.ncols)
]

# Copy the header rows into both output sheets.
for row_index in range(header_row_count):
    for col_index in range(sheet.ncols):
        header_value = sheet.cell_value(row_index, col_index)
        airplane_sheet.write(row_index, col_index, header_value)
        train_sheet.write(row_index, col_index, header_value)

# Classify every data row by the arrival text found in column F.
for row_index in range(header_row_count, sheet.nrows):
    values = [sheet.cell_value(row_index, col_index) for col_index in range(sheet.ncols)]
    arrival_info = sheet.cell_value(row_index, 5)  # column F
    arrival_text = extract_time(arrival_info)

    # A time object is kept next to the row so the rows can be sorted later.
    arrival_time = None
    if arrival_text:
        try:
            arrival_time = datetime.strptime(arrival_text, "%H:%M").time()
        except ValueError:
            pass

    # The extracted time text becomes an extra trailing column of the row.
    values.append(arrival_text)
    record = (arrival_time, values)

    if is_airport(arrival_info):
        airplane_data.append(record)
    elif is_train_station(arrival_info):
        train_data.append(record)
    else:
        # Unrecognised arrival type: which table it belongs to is a user
        # decision; by default it goes to the train table.
        train_data.append(record)
# Sort key for (time object, row values) pairs.
def sort_key(item):
    """Return a key ordering rows by arrival time, untimed rows last.

    Args:
        item: A ``(time_obj, row_data)`` pair; ``time_obj`` is a
            ``datetime.time`` or None when no time could be extracted.

    Returns:
        A ``(is_missing, time)`` tuple. False sorts before True, so the flag
        has to be "time is missing" for untimed rows to end up at the bottom.
    """
    time_obj, row_data = item
    return (time_obj is None, time_obj or datetime.min.time())
# Sort each data set independently.
airplane_data.sort(key=sort_key)
train_data.sort(key=sort_key)

# Write the plane table first, then the train table; in both, the data rows
# start right below the copied header rows.
for target_sheet, records in ((airplane_sheet, airplane_data), (train_sheet, train_data)):
    for row_index, (_, values) in enumerate(records, start=header_row_count):
        for col_index, value in enumerate(values):
            target_sheet.write(row_index, col_index, value)

# Save both workbooks and report where they went.
airplane_file = "飞机到达表.xls"
train_file = "火车到达表.xls"
airplane_workbook.save(airplane_file)
train_workbook.save(train_file)
print(f"处理完成,飞机到达表已保存至: {airplane_file}")
print(f"处理完成,火车到达表已保存至: {train_file}")