feat(news_analyze.py): 支持缓存分析结果并提取重要新闻原文

This commit is contained in:
Qihang Zhang 2025-04-09 19:10:23 +08:00
parent eb776fd3cb
commit 601a940882

View File

@ -146,14 +146,34 @@ def get_news_by_date(date_str, force_update=False):
return None
def analyze_news_in_batches(news_df):
"""将新闻按批次分析后汇总"""
def analyze_news_in_batches(news_df, date_str=None, force_analyze=False):
"""将新闻按批次分析后汇总,支持缓存分析结果"""
if news_df is None or news_df.empty:
return None
# 确保temp目录存在
temp_dir = ensure_temp_dir()
config = load_config()
openai_config = config['openai_api']
# 如果提供了日期,尝试读取缓存的分析结果
cache_file = None
if date_str:
date_file = datetime.datetime.strptime(date_str, '%Y-%m-%d').strftime('%Y%m%d')
cache_file = os.path.join(temp_dir, f"analysis_cache_{date_file}.json")
# 检查是否有缓存,且不强制重新分析
if os.path.exists(cache_file) and not force_analyze:
try:
import json
with open(cache_file, 'r', encoding='utf-8') as f:
cached_analysis = json.load(f)
print(f"使用缓存的分析结果: {cache_file}")
return cached_analysis
except Exception as e:
print(f"读取缓存文件时出错: {e}")
# 初始化 OpenAI 客户端
client = OpenAI(
api_key=openai_config['api_key'],
@ -164,6 +184,8 @@ def analyze_news_in_batches(news_df):
batch_size = 200
total_batches = math.ceil(len(news_df) / batch_size)
batch_results = []
# 保存每批次的重要新闻原文
important_news = []
print(f"{len(news_df)}条新闻分成{total_batches}批进行分析,每批{batch_size}")
@ -171,12 +193,41 @@ def analyze_news_in_batches(news_df):
start_idx = i * batch_size
end_idx = min((i + 1) * batch_size, len(news_df))
batch_df = news_df.iloc[start_idx:end_idx]
print(f"分析第{i + 1}/{total_batches}批新闻...")
# 生成缓存文件名,按批次缓存
batch_cache_file = None
if date_str:
date_file = datetime.datetime.strptime(date_str, '%Y-%m-%d').strftime('%Y%m%d')
batch_cache_file = os.path.join(temp_dir, f"batch_analysis_cache_{date_file}_{i + 1}.json")
# 检查是否有批次缓存,且不强制重新分析
if os.path.exists(batch_cache_file) and not force_analyze:
try:
import json
with open(batch_cache_file, 'r', encoding='utf-8') as f:
cached_batch = json.load(f)
print(f"使用缓存的批次分析结果: {batch_cache_file}")
batch_results.append(cached_batch)
# 添加此批次的重要新闻到列表
if "important_news" in cached_batch:
important_news.extend(cached_batch["important_news"])
continue
except Exception as e:
print(f"读取批次缓存文件时出错: {e}")
# 合并当前批次新闻内容
batch_content = ""
batch_news = [] # 存储此批次的新闻
for _, row in batch_df.iterrows():
news_item = {
"datetime": row['datetime'],
"content": row['content']
}
batch_news.append(news_item)
batch_content += f"{row['datetime']}: {row['content']}\n\n"
# 对当前批次进行初步分析
@ -185,13 +236,25 @@ def analyze_news_in_batches(news_df):
{batch_content}
总结以下两点:
完成以下两项分析:
1. 这批新闻中提到的重要政策特别是中国颁布的法令等如果没有相关政策可以直接说明
2. 这些政策可能对哪些股市板块带来影响利好/利空
请只提取重要信息简明扼要地回答
"""
对于重要的政策新闻请在分析后提供一个"重要新闻列表"包含新闻的原文和发布时间
请只提取重要信息过滤掉不相关的新闻如普通的股价波动ETF变动等
回复格式:
## 政策分析
[您的分析内容]
## 板块影响
[您的影响分析]
## 重要新闻列表
1. [时间] [完整新闻原文]
2. [时间] [完整新闻原文]
...
"""
try:
response = client.chat.completions.create(
messages=[
@ -203,14 +266,43 @@ def analyze_news_in_batches(news_df):
model=openai_config['model'],
temperature=0.2
)
batch_analysis = response.choices[0].message.content
batch_results.append({
# 提取重要新闻
batch_important_news = []
if "## 重要新闻列表" in batch_analysis:
news_section = batch_analysis.split("## 重要新闻列表", 1)[1].strip()
import re
# 匹配时间和新闻内容
news_matches = re.findall(r'\d+\.\s+\[([^\]]+)\]\s+(.+?)(?=\n\d+\.|$)', news_section, re.DOTALL)
for time_str, content in news_matches:
batch_important_news.append({
"datetime": time_str.strip(),
"content": content.strip()
})
batch_result = {
"batch_number": i + 1,
"news_count": len(batch_df),
"analysis": batch_analysis
})
"analysis": batch_analysis,
"important_news": batch_important_news
}
batch_results.append(batch_result)
# 添加重要新闻到总列表
important_news.extend(batch_important_news)
# 保存批次缓存
if batch_cache_file:
try:
import json
with open(batch_cache_file, 'w', encoding='utf-8') as f:
json.dump(batch_result, f, ensure_ascii=False, indent=2)
print(f"已缓存批次分析结果到: {batch_cache_file}")
except Exception as e:
print(f"保存批次缓存文件时出错: {e}")
except Exception as e:
print(f"分析第{i + 1}批新闻时出错: {e}")
@ -236,6 +328,7 @@ def analyze_news_in_batches(news_df):
- 提取3-5条最重要的国家层面政策特别是中国的宏观经济政策
- 每条政策用一句话概括核心内容
- 标注政策发布部门/会议名称
- 对每条重要政策提供对应的原始新闻来源
2. 板块影响分析
- 对每条重要政策分析直接影响的相关行业板块
@ -251,17 +344,17 @@ def analyze_news_in_batches(news_df):
### 一、核心政策摘要
1. [政策部门/会议] 政策核心内容
- 来源新闻: [对应的原始新闻]
- 影响板块利好XXX利空XXX
- 影响逻辑...
2. [政策部门/会议] 政策核心内容
- 来源新闻: [对应的原始新闻]
- 影响板块利好XXX利空XXX
- 影响逻辑...
### 二、综合市场影响
[整体分析包含市场情绪预判和关键时间节点提醒]
"""
try:
final_response = client.chat.completions.create(
messages=[
@ -273,16 +366,29 @@ def analyze_news_in_batches(news_df):
model=openai_config['model'],
temperature=0.2
)
final_analysis = final_response.choices[0].message.content
return {
analysis_result = {
"news_count": len(news_df),
"batch_count": total_batches,
"time_range": f"{news_df['datetime'].iloc[0]}{news_df['datetime'].iloc[-1]}" if not news_df.empty else "",
"batch_results": batch_results,
"final_analysis": final_analysis
"final_analysis": final_analysis,
"important_news": important_news # 保存所有重要新闻原文
}
# 保存完整分析缓存
if cache_file:
try:
import json
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(analysis_result, f, ensure_ascii=False, indent=2)
print(f"已缓存完整分析结果到: {cache_file}")
except Exception as e:
print(f"保存缓存文件时出错: {e}")
return analysis_result
except Exception as e:
print(f"生成最终分析时出错: {e}")
return None
@ -308,31 +414,53 @@ def compare_trend_analysis(yesterday_analysis, today_analysis):
base_url=openai_config['base_url']
)
# 提取今天和昨天的重要新闻
yesterday_news = yesterday_analysis.get('important_news', [])
today_news = today_analysis.get('important_news', [])
# 整理新闻字符串
yesterday_news_str = ""
for i, news in enumerate(yesterday_news):
yesterday_news_str += f"{i + 1}. [{news['datetime']}] {news['content']}\n\n"
today_news_str = ""
for i, news in enumerate(today_news):
today_news_str += f"{i + 1}. [{news['datetime']}] {news['content']}\n\n"
# 构建对比分析提示
prompt = f"""
prompt = f"""
你是一位资深财经分析师请对比昨天和今天的政策分析结果找出从弱势转为强势的板块分析未来市场趋势
### 昨天的分析:
{yesterday_analysis['final_analysis']}
### 昨天的重要新闻原文:
{yesterday_news_str}
### 今天的分析:
{today_analysis['final_analysis']}
### 今天的重要新闻原文:
{today_news_str}
请完成以下分析
1. 弱转强板块识别
- 对比昨天的弱势板块和今天的强势板块找出明显从弱势转为强势的板块
- 分析这些板块转强的原因
- 评估这种转变的持续性
- 关联对应的原始新闻作为判断依据
2. 政策趋势分析
- 分析昨天和今天的政策提取政策方向的变化或延续
- 判断政策组合对未来市场的影响
- 预测可能被持续关注和推动的行业方向
- 引用相关新闻作为证据
3. 投资机会预判
- 基于弱转强的板块和政策趋势预判未来1-2周可能出现的投资机会
- 提出值得重点关注的细分行业
- 提供相关的政策和新闻支持
请用以下结构化格式输出
@ -341,19 +469,20 @@ def compare_trend_analysis(yesterday_analysis, today_analysis):
- 转变原因...
- 相关政策支持...
- 持续性评估...
- 关键新闻依据[引用原始新闻]
2. [板块名称]
- 转变原因...
- 相关政策支持...
- 持续性评估...
- 关键新闻依据[引用原始新闻]
### 二、政策趋势变化
[分析政策方向的延续性和变化]
[分析政策方向的延续性和变化引用相关新闻作为证据]
### 三、投资机会与建议
[提出具体的投资方向和关注重点]
[提出具体的投资方向和关注重点引用相关政策和新闻]
"""
try:
response = client.chat.completions.create(
messages=[
@ -365,13 +494,14 @@ def compare_trend_analysis(yesterday_analysis, today_analysis):
model=openai_config['model'],
temperature=0.3
)
trend_analysis = response.choices[0].message.content
return {
"yesterday_date": yesterday_analysis.get("time_range", "昨天"),
"today_date": today_analysis.get("time_range", "今天"),
"trend_analysis": trend_analysis
"trend_analysis": trend_analysis,
"yesterday_news": yesterday_news,
"today_news": today_news
}
except Exception as e:
print(f"生成趋势分析时出错: {e}")
@ -391,7 +521,6 @@ def main():
# 获取昨天的新闻,只有在不存在时才获取
yesterday_file_exists = False
temp_dir = ensure_temp_dir()
for file in os.listdir(temp_dir):
if file.startswith(f"{yesterday_file}_") and file.endswith(".csv"):
yesterday_file_exists = True
@ -407,7 +536,6 @@ def main():
for file in os.listdir(temp_dir):
if file.startswith(f"{yesterday_file}_") and file.endswith(".csv"):
yesterday_files.append(os.path.join(temp_dir, file))
dfs = []
for file in yesterday_files:
try:
@ -415,65 +543,86 @@ def main():
dfs.append(df)
except Exception as e:
print(f"读取文件{file}时出错: {e}")
if dfs:
yesterday_news = pd.concat(dfs, ignore_index=True)
if 'datetime' in yesterday_news.columns and 'content' in yesterday_news.columns:
yesterday_news = yesterday_news[['datetime', 'content']]
# 分析今天和昨天的新闻
# 分析今天和昨天的新闻,使用缓存机制
today_analysis = None
yesterday_analysis = None
if today_news is not None:
print(f"获取到 {len(today_news)} 条今日新闻,正在分析...")
today_analysis = analyze_news_in_batches(today_news)
today_analysis = analyze_news_in_batches(today_news, today, force_analyze=False)
# 保存今天的原始新闻和分析结果
today_news.to_csv(f"news_raw_{today_file}.csv", index=False, encoding='utf-8-sig')
today_news.to_csv(f"temp/news_raw_{today_file}.csv", index=False, encoding='utf-8-sig')
if today_analysis:
with open(f"news_analysis_{today_file}.txt", "w", encoding="utf-8") as f:
with open(f"temp/news_analysis_{today_file}.txt", "w", encoding="utf-8") as f:
f.write(
f"分析范围: {today_analysis['news_count']} 条新闻,分成 {today_analysis['batch_count']} 批处理\n")
f.write(f"时间范围: {today_analysis['time_range']}\n\n")
f.write("最终分析结果:\n")
f.write("-" * 80 + "\n")
f.write(today_analysis['final_analysis'])
f.write("\n" + "-" * 80 + "\n")
f.write("\n" + "-" * 80 + "\n\n")
print(f"今日分析结果已保存到 news_analysis_{today_file}.txt")
# 添加重要新闻原文
f.write("重要新闻原文:\n")
f.write("-" * 80 + "\n")
for i, news in enumerate(today_analysis.get('important_news', [])):
f.write(f"{i + 1}. [{news['datetime']}] {news['content']}\n\n")
f.write("-" * 80 + "\n")
print(f"今日分析结果已保存到 temp/news_analysis_{today_file}.txt")
else:
print("无法获取今天的新闻数据")
if yesterday_news is not None:
print(f"获取到 {len(yesterday_news)} 条昨日新闻,正在分析...")
yesterday_analysis = analyze_news_in_batches(yesterday_news)
yesterday_analysis = analyze_news_in_batches(yesterday_news, yesterday, force_analyze=False)
# 保存昨天的原始新闻和分析结果
yesterday_news.to_csv(f"news_raw_{yesterday_file}.csv", index=False, encoding='utf-8-sig')
yesterday_news.to_csv(f"temp/news_raw_{yesterday_file}.csv", index=False, encoding='utf-8-sig')
if yesterday_analysis:
with open(f"news_analysis_{yesterday_file}.txt", "w", encoding="utf-8") as f:
with open(f"temp/news_analysis_{yesterday_file}.txt", "w", encoding="utf-8") as f:
f.write(
f"分析范围: {yesterday_analysis['news_count']} 条新闻,分成 {yesterday_analysis['batch_count']} 批处理\n")
f.write(f"时间范围: {yesterday_analysis['time_range']}\n\n")
f.write("最终分析结果:\n")
f.write("-" * 80 + "\n")
f.write(yesterday_analysis['final_analysis'])
f.write("\n" + "-" * 80 + "\n")
f.write("\n" + "-" * 80 + "\n\n")
print(f"昨日分析结果已保存到 news_analysis_{yesterday_file}.txt")
# 添加重要新闻原文
f.write("重要新闻原文:\n")
f.write("-" * 80 + "\n")
for i, news in enumerate(yesterday_analysis.get('important_news', [])):
f.write(f"{i + 1}. [{news['datetime']}] {news['content']}\n\n")
f.write("-" * 80 + "\n")
print(f"昨日分析结果已保存到 temp/news_analysis_{yesterday_file}.txt")
else:
# 尝试读取昨天的分析结果文件
try:
with open(f"news_analysis_{yesterday_file}.txt", "r", encoding="utf-8") as f:
content = f.read()
yesterday_analysis = {
"final_analysis": content,
"time_range": yesterday
}
print(f"已从文件中读取昨日分析结果")
# 首先尝试读取JSON缓存文件
cache_file = os.path.join(temp_dir, f"analysis_cache_{yesterday_file}.json")
if os.path.exists(cache_file):
import json
with open(cache_file, 'r', encoding='utf-8') as f:
yesterday_analysis = json.load(f)
print(f"已从缓存中读取昨日分析结果")
else:
# 如果没有缓存,尝试读取文本文件
with open(f"temp/news_analysis_{yesterday_file}.txt", "r", encoding="utf-8") as f:
content = f.read()
yesterday_analysis = {
"final_analysis": content.split("-" * 80)[1].strip(),
"time_range": yesterday,
"important_news": [] # 从文本文件中难以结构化提取新闻,置为空列表
}
print(f"已从文本文件中读取昨日分析结果")
except Exception as e:
print(f"无法读取昨日分析结果: {e}")
@ -481,17 +630,29 @@ def main():
if today_analysis and yesterday_analysis:
print("正在进行今日与昨日的趋势对比分析...")
trend_analysis = compare_trend_analysis(yesterday_analysis, today_analysis)
if trend_analysis:
# 保存趋势分析结果
with open(f"trend_analysis_{today_file}.txt", "w", encoding="utf-8") as f:
with open(f"temp/trend_analysis_{today_file}.txt", "w", encoding="utf-8") as f:
f.write(f"对比分析范围: 昨日({yesterday}) 与 今日({today})\n\n")
f.write("趋势分析结果:\n")
f.write("-" * 80 + "\n")
f.write(trend_analysis['trend_analysis'])
f.write("\n" + "-" * 80 + "\n")
f.write("\n" + "-" * 80 + "\n\n")
print(f"趋势分析结果已保存到 trend_analysis_{today_file}.txt")
# 添加弱转强的新闻原文供参考
f.write("昨日重要新闻原文:\n")
f.write("-" * 80 + "\n")
for i, news in enumerate(trend_analysis.get('yesterday_news', [])):
f.write(f"{i + 1}. [{news['datetime']}] {news['content']}\n\n")
f.write("-" * 80 + "\n\n")
f.write("今日重要新闻原文:\n")
f.write("-" * 80 + "\n")
for i, news in enumerate(trend_analysis.get('today_news', [])):
f.write(f"{i + 1}. [{news['datetime']}] {news['content']}\n\n")
f.write("-" * 80 + "\n")
print(f"趋势分析结果已保存到 temp/trend_analysis_{today_file}.txt")
# 打印趋势分析结果
print("\n=== 今日与昨日趋势对比分析 ===\n")