first commit

This commit is contained in:
zms 2025-05-21 11:33:39 +08:00
commit c3962e21b9
9 changed files with 4044 additions and 0 deletions

2854
rain_forcast_3h.py Normal file

File diff suppressed because it is too large Load Diff

8
requirements.txt Normal file
View File

@ -0,0 +1,8 @@
numpy>=1.21.0
pandas>=1.3.0
scikit-learn>=0.24.2
xgboost>=1.4.2
matplotlib>=3.4.2
seaborn>=0.11.1
pymysql>=1.0.2
joblib>=1.0.1

142
src/data/data_processor.py Normal file
View File

@ -0,0 +1,142 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from ..utils.feature_engineering import FeatureEngineering
class DataProcessor:
def __init__(self, debug=True):
"""
初始化数据处理器
"""
self.debug = debug
self.scaler = StandardScaler()
self.feature_columns = None
self.target_columns = None
def debug_print(self, message):
"""
调试信息打印
"""
if self.debug:
print(message)
def preprocess_data(self, data, silent=False):
"""
数据预处理
"""
if not silent:
self.debug_print("开始数据预处理...")
# 复制数据以避免修改原始数据
df = data.copy()
# 确保datetime列是datetime类型
if 'datetime' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['datetime']):
df['datetime'] = pd.to_datetime(df['datetime'])
# 处理缺失值
df = self._handle_missing_values(df)
# 特征工程
df = FeatureEngineering.process_features(df)
# 移除包含NaN的行
df = df.dropna()
if not silent:
self.debug_print(f"预处理完成,数据形状: {df.shape}")
return df
def _handle_missing_values(self, df):
"""
处理缺失值
"""
# 对于时间序列数据,使用前向填充
df = df.fillna(method='ffill')
# 对于剩余的缺失值,使用列的中位数填充
df = df.fillna(df.median())
return df
def select_features(self, data, n_features=30, silent=False):
"""
特征选择
"""
if not silent:
self.debug_print(f"选择前{n_features}个特征...")
# 计算特征重要性
feature_importance = self._calculate_feature_importance(data)
# 选择最重要的特征
selected_features = feature_importance.nlargest(n_features).index.tolist()
if not silent:
self.debug_print(f"选择的特征: {selected_features}")
return selected_features
def _calculate_feature_importance(self, data):
"""
计算特征重要性
"""
# 这里使用一个简单的基于相关性的特征重要性计算
# 在实际应用中,你可能需要使用更复杂的方法
correlations = data.corr()['rainfall'].abs()
return correlations.drop('rainfall')
def split_data(self, X, y_binary, y_amount, y_category=None, time_based=False, test_size=0.2):
"""
数据分割
"""
if time_based:
# 基于时间的分割
split_idx = int(len(X) * (1 - test_size))
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train_binary = y_binary[:split_idx]
y_test_binary = y_binary[split_idx:]
y_train_amount = y_amount[:split_idx]
y_test_amount = y_amount[split_idx:]
if y_category is not None:
y_train_category = y_category[:split_idx]
y_test_category = y_category[split_idx:]
else:
y_train_category = None
y_test_category = None
else:
# 随机分割
from sklearn.model_selection import train_test_split
X_train, X_test, y_train_binary, y_test_binary, y_train_amount, y_test_amount = train_test_split(
X, y_binary, y_amount, test_size=test_size, random_state=42
)
if y_category is not None:
_, _, y_train_category, y_test_category = train_test_split(
X, y_category, test_size=test_size, random_state=42
)
else:
y_train_category = None
y_test_category = None
return X_train, X_test, y_train_binary, y_test_binary, y_train_amount, y_test_amount, y_train_category, y_test_category
def scale_features(self, X_train, X_test):
"""
特征缩放
"""
# 保存特征列名
self.feature_columns = X_train.columns
# 缩放特征
X_train_scaled = self.scaler.fit_transform(X_train)
X_test_scaled = self.scaler.transform(X_test)
# 转换回DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=self.feature_columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=self.feature_columns)
return X_train_scaled, X_test_scaled

View File

@ -0,0 +1,236 @@
import pandas as pd
import pymysql
from datetime import datetime, timedelta
class DatabaseConnector:
def __init__(self, rtk_host='localhost', rtk_user='root', rtk_password='root', rtk_database='rtk_data',
rain_host='8.134.185.53', rain_user='remote', rain_password='root', rain_database='rain_db'):
"""
初始化数据库连接器
"""
# RTK数据库配置
self.rtk_host = 'localhost'
self.rtk_user = 'root'
self.rtk_password = 'root'
self.rtk_database = 'rtk_data'
self.rtk_connection = None
# 气象数据库配置
self.rain_host = '8.134.185.53'
self.rain_user = 'remote'
self.rain_password = 'root'
self.rain_database = 'rain_db'
self.rain_connection = None
def connect_rtk(self):
"""
建立RTK数据库连接
"""
try:
self.rtk_connection = pymysql.connect(
host=self.rtk_host,
user=self.rtk_user,
password=self.rtk_password,
database=self.rtk_database
)
return True
except Exception as e:
print(f"RTK数据库连接失败: {str(e)}")
return False
def connect_rain(self):
"""
建立气象数据库连接
"""
try:
self.rain_connection = pymysql.connect(
host=self.rain_host,
user=self.rain_user,
password=self.rain_password,
database=self.rain_database
)
return True
except Exception as e:
print(f"气象数据库连接失败: {str(e)}")
return False
def disconnect(self):
"""
关闭所有数据库连接
"""
if self.rtk_connection:
self.rtk_connection.close()
self.rtk_connection = None
if self.rain_connection:
self.rain_connection.close()
self.rain_connection = None
def get_latest_weather_data(self):
"""
获取最新的气象数据合并RTK和气象数据的最新记录
使用RTK数据的时间作为基准时间
"""
# 获取RTK数据
if not self.rtk_connection:
if not self.connect_rtk():
return None
try:
# 获取RTK数据最新记录
rtk_query = """
SELECT
timestamp,
pwv,
ztd,
ztd_gradient_east,
ztd_gradient_north
FROM rtk_data
ORDER BY timestamp DESC
LIMIT 1
"""
rtk_df = pd.read_sql(rtk_query, self.rtk_connection)
if rtk_df.empty:
print("RTK数据库中没有数据")
return None
# 获取气象数据最新记录
if not self.rain_connection:
if not self.connect_rain():
print("无法连接到气象数据库")
return rtk_df
rain_query = """
SELECT
wind_speed,
wind_force,
wind_direction_360 as wind_direction,
humidity,
temperature,
atm_pressure,
solar_radiation,
rainfall
FROM sensor_data
ORDER BY timestamp DESC
LIMIT 1
"""
rain_df = pd.read_sql(rain_query, self.rain_connection)
if rain_df.empty:
print("气象数据库中没有数据")
return rtk_df
# 重命名RTK数据列
rtk_df = rtk_df.rename(columns={
'timestamp': 'datetime',
'pwv': 'PWV',
'ztd': 'ZTD'
})
# 重命名气象数据列
rain_df = rain_df.rename(columns={
'humidity': 'rh',
'temperature': 'Temp',
'atm_pressure': 'Press',
'wind_speed': 'ws'
})
# 将RTK的时间戳转换为datetime类型
rtk_df['datetime'] = pd.to_datetime(rtk_df['datetime'])
# 合并数据使用RTK的时间
merged_df = pd.concat([rtk_df, rain_df], axis=1)
# 删除重复的列(如果有的话)
merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]
return merged_df
except Exception as e:
print(f"获取数据失败: {str(e)}")
return None
def get_historical_data(self, start_date, end_date):
"""
获取历史数据合并RTK和气象数据
"""
# 获取RTK数据
if not self.rtk_connection:
if not self.connect_rtk():
return None
try:
# 获取RTK数据
rtk_query = """
SELECT
timestamp,
pwv,
ztd,
ztd_gradient_east,
ztd_gradient_north
FROM rtk_data
WHERE timestamp BETWEEN %s AND %s
ORDER BY timestamp
"""
rtk_df = pd.read_sql(rtk_query, self.rtk_connection, params=[start_date, end_date])
# 重命名RTK数据列
rtk_df = rtk_df.rename(columns={
'timestamp': 'datetime',
'pwv': 'PWV',
'ztd': 'ZTD'
})
# 获取气象数据
if not self.rain_connection:
if not self.connect_rain():
return rtk_df # 如果气象数据库连接失败返回RTK数据
rain_query = """
SELECT
timestamp,
wind_speed,
wind_force,
wind_direction_360 as wind_direction,
humidity,
temperature,
atm_pressure,
solar_radiation,
rainfall
FROM sensor_data
WHERE timestamp BETWEEN %s AND %s
ORDER BY timestamp
"""
rain_df = pd.read_sql(rain_query, self.rain_connection, params=[start_date, end_date])
# 重命名气象数据列
rain_df = rain_df.rename(columns={
'timestamp': 'datetime',
'humidity': 'rh',
'temperature': 'Temp',
'atm_pressure': 'Press',
'wind_speed': 'ws'
})
# 合并数据
if not rain_df.empty:
# 将时间戳转换为datetime类型
rtk_df['datetime'] = pd.to_datetime(rtk_df['datetime'])
rain_df['datetime'] = pd.to_datetime(rain_df['datetime'])
# 合并数据
merged_df = pd.merge(rtk_df, rain_df, on='datetime', how='outer')
# 按时间排序
merged_df = merged_df.sort_values('datetime')
# 填充缺失值
merged_df = merged_df.fillna(method='ffill')
return merged_df
else:
return rtk_df
except Exception as e:
print(f"获取历史数据失败: {str(e)}")
return None

View File

@ -0,0 +1,154 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from ..models.rainfall_model import RainfallTwoStageModel
from ..data.data_processor import DataProcessor
from ..utils.feature_engineering import FeatureEngineering
class FutureRainfallForecastSystem:
def __init__(self, model_path=None, debug=False):
"""
初始化未来降雨预报系统
"""
self.debug = debug
self.forecast_models = {}
self.data_processor = DataProcessor(debug=debug)
self.model_path = model_path
def train_forecast_models(self, data, features_config=None):
"""
训练多个时间点的预测模型
"""
print("开始训练未来降雨预报模型...")
# 预处理数据
processed_data = self.data_processor.preprocess_data(data)
# 为每个预测时间点训练模型
for time_point in [1, 3, 6, 12, 24]: # 预测未来1, 3, 6, 12, 24小时
print(f"\n训练{time_point}小时预报模型...")
# 准备训练数据
future_data = self._prepare_future_data(processed_data, time_point)
# 创建和训练模型
model = RainfallTwoStageModel(debug=self.debug)
model.train_and_evaluate(future_data)
# 保存模型
self.forecast_models[time_point] = model
print(f"{time_point}小时预报模型训练完成")
def _prepare_future_data(self, data, time_point):
"""
准备未来时间点的训练数据
"""
# 创建时间偏移
future_data = data.copy()
future_data['datetime'] = future_data['datetime'] + timedelta(hours=time_point)
# 合并当前数据和未来数据
merged_data = pd.merge(
data,
future_data[['datetime', 'rainfall']],
on='datetime',
suffixes=('', '_future')
)
# 重命名目标变量
merged_data = merged_data.rename(columns={'rainfall_future': 'rainfall'})
return merged_data
def forecast(self, current_data):
"""
进行未来降雨预报
"""
if not self.forecast_models:
raise ValueError("没有可用的预报模型,请先训练模型")
# 预处理当前数据
processed_data = self.data_processor.preprocess_data(current_data)
# 存储预报结果
forecast_results = {}
# 对每个时间点进行预报
for time_point, model in self.forecast_models.items():
# 准备预报数据
forecast_data = self._prepare_future_data(processed_data, time_point)
# 进行预报
predictions = model.predict(forecast_data)
# 存储结果
forecast_results[time_point] = predictions
return forecast_results
def load_models(self, base_path):
"""
加载预报模型
"""
if not os.path.exists(base_path):
raise FileNotFoundError(f"模型目录不存在: {base_path}")
for time_point in [1, 3, 6, 12, 24]:
model_path = os.path.join(base_path, f'model_{time_point}h.joblib')
if os.path.exists(model_path):
try:
model = RainfallTwoStageModel.load_model(model_path)
self.forecast_models[time_point] = model
print(f"成功加载{time_point}小时预报模型")
except Exception as e:
print(f"加载{time_point}小时预报模型失败: {str(e)}")
def save_models(self, base_path):
"""
保存预报模型
"""
os.makedirs(base_path, exist_ok=True)
for time_point, model in self.forecast_models.items():
model_path = os.path.join(base_path, f'model_{time_point}h.joblib')
model.save_model(model_path)
print(f"保存{time_point}小时预报模型到: {model_path}")
def visualize_forecast(self, forecast_results, output_dir='forecast_visualizations'):
"""
可视化预报结果
"""
import matplotlib.pyplot as plt
os.makedirs(output_dir, exist_ok=True)
# 创建预报时间序列
time_points = sorted(forecast_results.keys())
rain_probs = [results['rain_probability'].mean() for results in forecast_results.values()]
rain_amounts = [results['rainfall_amount'].mean() for results in forecast_results.values()]
# 绘制降雨概率
plt.figure(figsize=(10, 6))
plt.plot(time_points, rain_probs, 'b-', label='降雨概率')
plt.fill_between(time_points, 0, rain_probs, alpha=0.2)
plt.xlabel('预报时间(小时)')
plt.ylabel('降雨概率')
plt.title('未来降雨概率预报')
plt.grid(True)
plt.legend()
plt.savefig(os.path.join(output_dir, 'rain_probability.png'))
plt.close()
# 绘制降雨量
plt.figure(figsize=(10, 6))
plt.plot(time_points, rain_amounts, 'r-', label='预计降雨量')
plt.fill_between(time_points, 0, rain_amounts, alpha=0.2)
plt.xlabel('预报时间(小时)')
plt.ylabel('降雨量 (mm)')
plt.title('未来降雨量预报')
plt.grid(True)
plt.legend()
plt.savefig(os.path.join(output_dir, 'rainfall_amount.png'))
plt.close()

232
src/main.py Normal file
View File

@ -0,0 +1,232 @@
import argparse
import os
from datetime import datetime
import pandas as pd
from database.db_connector import DatabaseConnector
from data.data_processor import DataProcessor
from models.rainfall_model import RainfallTwoStageModel
from forecast.forecast_system import FutureRainfallForecastSystem
from utils.visualization import Visualization
def train_models(data_file, model_dir, n_features=35, clf_model='ensemble', reg_model='gbdt', time_based=True):
"""
训练模型
"""
print(f"加载训练数据: {data_file}")
try:
# 读取历史数据
historical_data = pd.read_csv(data_file)
# 确保时间列是日期时间格式
if 'datetime' in historical_data.columns:
historical_data['datetime'] = pd.to_datetime(historical_data['datetime'])
# 创建预报系统
forecast_system = FutureRainfallForecastSystem()
# 训练模型
forecast_system.train_forecast_models(historical_data)
# 保存模型
forecast_system.save_models(model_dir)
print("模型训练完成")
except Exception as e:
print(f"训练模型时出错: {str(e)}")
def run_single_prediction(input_file, model_dir):
"""
运行单次预测
"""
print("启动单次预测...")
# 检查模型目录是否存在
if not os.path.exists(model_dir):
print(f"错误: 未找到模型目录 '{model_dir}'")
print("请先使用历史数据训练模型")
return
try:
# 读取输入数据
print(f"读取输入数据: {input_file}")
input_data = pd.read_csv(input_file)
# 确保时间列是日期时间格式
if 'datetime' in input_data.columns:
input_data['datetime'] = pd.to_datetime(input_data['datetime'])
# 创建预报系统
forecast_system = FutureRainfallForecastSystem()
# 加载模型
print("加载预报模型...")
forecast_system.load_models(model_dir)
if not forecast_system.forecast_models:
print("错误: 未能加载任何模型")
return
# 进行预报
print("\n开始预测降雨情况...")
forecast_results = forecast_system.forecast(input_data)
# 可视化预报结果
print("\n生成预报可视化...")
visualization = Visualization(output_dir='single_prediction_results')
visualization.plot_forecast_results(forecast_results)
# 输出预报报告
print("\n降雨预报摘要:")
print("=" * 50)
current_time = datetime.now()
print(f"预报时间: {current_time.strftime('%Y-%m-%d %H:%M')}")
for time_point, result in forecast_results.items():
print(f"\n未来{time_point}小时预报:")
print("-" * 30)
rain_prob = result['rain_probability'].mean()
rain_predicted = sum(result['rain_binary']) > 0
print(f"降雨概率: {rain_prob:.2%}")
print(f"预计降雨: {'' if rain_predicted else ''}")
if rain_predicted:
rain_amount = result['rainfall_amount'].mean()
print(f"预计降雨量: {rain_amount:.2f}mm")
# 根据降雨量给出建议
if rain_amount >= 25: # 大雨
print("建议: 避免外出,注意防范强降雨可能带来的积水和交通影响")
elif rain_amount >= 10: # 中雨
print("建议: 出门请携带雨具,道路可能湿滑,驾车注意安全")
else: # 小雨
print("建议: 带好雨具,不会对出行造成太大影响")
print("\n预测完成")
except Exception as e:
print(f"运行单次预测时出错: {str(e)}")
def run_realtime_forecast(model_dir):
"""
运行实时预报
"""
print("启动实时降雨预报系统...")
# 检查模型目录是否存在
if not os.path.exists(model_dir):
print(f"错误: 未找到模型目录 '{model_dir}'")
print("请先使用历史数据训练模型")
return
try:
# 创建数据库连接器
db_connector = DatabaseConnector()
# 获取当前气象数据
print("获取当前气象数据...")
current_weather = db_connector.get_latest_weather_data()
if current_weather is None or len(current_weather) == 0:
print("错误: 无法获取当前气象数据")
return
# 创建预报系统
forecast_system = FutureRainfallForecastSystem()
# 加载模型
print("加载预报模型...")
forecast_system.load_models(model_dir)
if not forecast_system.forecast_models:
print("错误: 未能加载任何模型")
return
# 进行预报
print("\n开始预测未来降雨情况...")
forecast_results = forecast_system.forecast(current_weather)
# 可视化预报结果
print("\n生成预报可视化...")
visualization = Visualization(output_dir='realtime_forecast_results')
visualization.plot_forecast_results(forecast_results)
# 输出预报报告
print("\n未来降雨预报摘要:")
print("=" * 50)
current_time = datetime.now()
print(f"预报时间: {current_time.strftime('%Y-%m-%d %H:%M')}")
for time_point, result in forecast_results.items():
print(f"\n未来{time_point}小时预报:")
print("-" * 30)
rain_prob = result['rain_probability'].mean()
rain_predicted = sum(result['rain_binary']) > 0
print(f"降雨概率: {rain_prob:.2%}")
print(f"预计降雨: {'' if rain_predicted else ''}")
if rain_predicted:
rain_amount = result['rainfall_amount'].mean()
print(f"预计降雨量: {rain_amount:.2f}mm")
# 根据降雨量给出建议
if rain_amount >= 25: # 大雨
print("建议: 避免外出,注意防范强降雨可能带来的积水和交通影响")
elif rain_amount >= 10: # 中雨
print("建议: 出门请携带雨具,道路可能湿滑,驾车注意安全")
else: # 小雨
print("建议: 带好雨具,不会对出行造成太大影响")
print("\n预报完成")
except Exception as e:
print(f"运行实时预报时出错: {str(e)}")
finally:
if 'db_connector' in locals():
db_connector.disconnect()
def main():
parser = argparse.ArgumentParser(description="未来降雨预报系统")
parser.add_argument('--mode', type=str, default='train',
choices=['train', 'predict', 'realtime'],
help='执行模式: train-训练模型, predict-单次预测, realtime-实时预测')
parser.add_argument('--data', type=str, default='rainfall_data_time.csv',
help='数据文件路径')
parser.add_argument('--input', type=str,
help='单次预测的输入数据文件路径')
parser.add_argument('--model_dir', type=str, default='rainfall_forecast_models',
help='模型保存/加载目录')
parser.add_argument('--n_features', type=int, default=35,
help='使用的特征数量')
parser.add_argument('--clf_model', type=str, default='ensemble',
choices=['lr', 'rf', 'gbdt', 'xgboost', 'ensemble'],
help='分类器类型')
parser.add_argument('--reg_model', type=str, default='gbdt',
choices=['rf', 'gbdt', 'ridge', 'xgboost'],
help='回归器类型')
parser.add_argument('--time_based', action='store_true',
help='是否使用基于时间的数据分割')
args = parser.parse_args()
if args.mode == 'train':
train_models(args.data, args.model_dir, args.n_features,
args.clf_model, args.reg_model, args.time_based)
elif args.mode == 'predict':
if not args.input:
print("错误: 单次预测模式需要指定输入数据文件 (--input)")
return
run_single_prediction(args.input, args.model_dir)
elif args.mode == 'realtime':
run_realtime_forecast(args.model_dir)
else:
print(f"不支持的模式: {args.mode}")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,176 @@
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
import joblib
import os
class RainfallTwoStageModel:
def __init__(self, debug=True, clf_model_type='ensemble', reg_model_type='gbdt', output_dir='figures'):
"""
初始化两阶段降雨预测模型
"""
self.debug = debug
self.clf_model_type = clf_model_type
self.reg_model_type = reg_model_type
self.output_dir = output_dir
self.binary_classifier = None
self.regressor = None
self.category_classifier = None
self.threshold = 0.5
def build_binary_classifier(self):
"""
构建二分类器
"""
if self.clf_model_type == 'ensemble':
# 创建多个基础分类器
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gbdt = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=42)
# 创建投票分类器
self.binary_classifier = VotingClassifier(
estimators=[
('lr', lr),
('rf', rf),
('gbdt', gbdt),
('xgb', xgb_clf)
],
voting='soft'
)
elif self.clf_model_type == 'lr':
self.binary_classifier = LogisticRegression(max_iter=1000, random_state=42)
elif self.clf_model_type == 'rf':
self.binary_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
elif self.clf_model_type == 'gbdt':
self.binary_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
elif self.clf_model_type == 'xgboost':
self.binary_classifier = xgb.XGBClassifier(n_estimators=100, random_state=42)
else:
raise ValueError(f"不支持的分类器类型: {self.clf_model_type}")
def build_regressor(self):
"""
构建回归器
"""
if self.reg_model_type == 'rf':
self.regressor = RandomForestRegressor(n_estimators=100, random_state=42)
elif self.reg_model_type == 'gbdt':
self.regressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
elif self.reg_model_type == 'ridge':
self.regressor = Ridge(alpha=1.0)
elif self.reg_model_type == 'xgboost':
self.regressor = xgb.XGBRegressor(n_estimators=100, random_state=42)
else:
raise ValueError(f"不支持的回归器类型: {self.reg_model_type}")
def train_binary_classifier(self, X_train, y_train, X_val=None, y_val=None):
"""
训练二分类器
"""
if self.binary_classifier is None:
self.build_binary_classifier()
self.binary_classifier.fit(X_train, y_train)
# 优化阈值
if X_val is not None and y_val is not None:
self.optimize_threshold(X_val, y_val)
def train_regressor(self, X_train, y_train, rain_mask):
"""
训练回归器
"""
if self.regressor is None:
self.build_regressor()
# 只使用有雨的样本训练回归器
X_train_rain = X_train[rain_mask]
y_train_rain = y_train[rain_mask]
if len(X_train_rain) > 0:
self.regressor.fit(X_train_rain, y_train_rain)
def optimize_threshold(self, X_val, y_val):
"""
优化分类阈值
"""
from sklearn.metrics import f1_score
# 获取预测概率
y_pred_proba = self.binary_classifier.predict_proba(X_val)[:, 1]
# 尝试不同的阈值
thresholds = np.arange(0.1, 1.0, 0.1)
best_f1 = 0
best_threshold = 0.5
for threshold in thresholds:
y_pred = (y_pred_proba >= threshold).astype(int)
f1 = f1_score(y_val, y_pred)
if f1 > best_f1:
best_f1 = f1
best_threshold = threshold
self.threshold = best_threshold
def predict(self, X):
"""
预测降雨情况
"""
# 二分类预测
y_pred_proba = self.binary_classifier.predict_proba(X)[:, 1]
rain_mask = y_pred_proba >= self.threshold
# 回归预测
y_pred_amount = np.zeros(len(X))
if self.regressor is not None and any(rain_mask):
y_pred_amount[rain_mask] = self.regressor.predict(X[rain_mask])
return {
'rain_probability': y_pred_proba,
'rain_binary': rain_mask.astype(int),
'rainfall_amount': y_pred_amount
}
def save_model(self, filename='rainfall_two_stage_model.joblib'):
"""
保存模型
"""
model_data = {
'binary_classifier': self.binary_classifier,
'regressor': self.regressor,
'threshold': self.threshold,
'clf_model_type': self.clf_model_type,
'reg_model_type': self.reg_model_type
}
joblib.dump(model_data, filename)
@classmethod
def load_model(cls, filename='rainfall_two_stage_model.joblib'):
"""
加载模型
"""
if not os.path.exists(filename):
raise FileNotFoundError(f"模型文件不存在: {filename}")
model_data = joblib.load(filename)
model = cls(
clf_model_type=model_data['clf_model_type'],
reg_model_type=model_data['reg_model_type']
)
model.binary_classifier = model_data['binary_classifier']
model.regressor = model_data['regressor']
model.threshold = model_data['threshold']
return model

View File

@ -0,0 +1,129 @@
import numpy as np
import pandas as pd
class FeatureEngineering:
@staticmethod
def calculate_time_features(df):
"""
计算时间相关特征
"""
df['hour'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df['day_of_week'] = df['datetime'].dt.dayofweek
return df
@staticmethod
def calculate_change_features(df, variables):
"""
计算变化率特征
"""
for var in variables:
# 1小时变化
df[f'{var}_1h_change'] = df[var].diff(1)
# 3小时变化
df[f'{var}_3h_change'] = df[var].diff(3)
# 6小时变化
df[f'{var}_6h_change'] = df[var].diff(6)
# 计算滚动均值
df[f'{var}_3h_mean'] = df[var].rolling(window=3, min_periods=1).mean()
df[f'{var}_6h_mean'] = df[var].rolling(window=6, min_periods=1).mean()
# 滚动标准差
df[f'{var}_6h_std'] = df[var].rolling(window=6, min_periods=1).std().fillna(0)
# 异常值
var_mean = df[var].mean()
var_std = df[var].std()
df[f'{var}_anomaly'] = (df[var] - var_mean) / var_std if var_std > 0 else 0
# 急剧变化指标
var_std = df[f'{var}_1h_change'].std()
if var_std > 0:
df[f'{var}_rapid_change'] = (np.abs(df[f'{var}_1h_change']) > (2 * var_std)).astype(int)
else:
df[f'{var}_rapid_change'] = 0
# 变化加速度
df[f'{var}_change_accel'] = df[f'{var}_1h_change'].diff(1).fillna(0)
return df
@staticmethod
def calculate_combined_features(df):
"""
计算组合特征
"""
# PWV和湿度相关特征
if all(var in df.columns for var in ['PWV', 'rh']):
df['PWV_rh'] = df['PWV'] * df['rh'] / 100
df['PWV_cumul_change'] = df['PWV'].diff(1).rolling(window=6).sum().fillna(0)
df['rh_cumul_change'] = df['rh'].diff(1).rolling(window=6).sum().fillna(0)
# PWV和温度相关特征
if all(var in df.columns for var in ['PWV', 'Temp']):
df['PWV_Temp'] = df['PWV'] * df['Temp']
temp_exp = np.exp(17.27 * (df['Temp']-273.15) / ((df['Temp']-273.15) + 237.3))
df['saturation_index'] = df['PWV'] / (0.1 * temp_exp)
# 气压相关特征
if all(var in df.columns for var in ['Press', 'Press_1h_change']):
df['Press_change_intensity'] = df['Press_1h_change'].abs()
df['Press_falling'] = (df['Press_1h_change'] < 0).astype(int)
press_quantile = df['Press_1h_change'].abs().quantile(0.75)
df['Press_rapid_fall'] = ((df['Press_1h_change'] < 0) &
(df['Press_1h_change'].abs() > press_quantile)).astype(int)
# ZTD和PWV相关特征
if all(var in df.columns for var in ['ZTD', 'PWV']):
df['ZTD_PWV_ratio'] = df['ZTD'] / df['PWV'].replace(0, np.nan).fillna(df['PWV'].mean())
if all(var in df.columns for var in ['ZTD_1h_change', 'PWV_1h_change']):
df['ZTD_PWV_change'] = df['ZTD_1h_change'] * df['PWV_1h_change']
pwv_mean = df['PWV_6h_mean'].mean()
if pwv_mean > 0:
df['moisture_instability'] = df['PWV_6h_std'] / df['PWV_6h_mean']
else:
df['moisture_instability'] = 0
# 降雨潜力指标
if all(var in df.columns for var in ['PWV', 'rh', 'Temp', 'Press_1h_change']):
df['rain_potential'] = ((df['PWV'] * df['rh'] / 100) *
(0.1 - df['Press_1h_change']) /
(df['Temp'] + 10))
pwv_mean = df['PWV'].mean() + 0.1
df['heavy_rain_potential'] = (df['PWV'] * (df['rh'] / 100) *
(df['Press_rapid_fall'] + 1) *
(1 + np.abs(df['PWV_1h_change']) /
pwv_mean))
df['rain_persistence'] = (df['PWV_cumul_change'] *
df['rh_cumul_change'] *
(1 + df['Press_change_intensity']))
return df
@staticmethod
def process_features(df):
"""
处理所有特征
"""
# 确保datetime列是datetime类型
if 'datetime' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['datetime']):
df['datetime'] = pd.to_datetime(df['datetime'])
# 计算时间特征
df = FeatureEngineering.calculate_time_features(df)
# 计算变化率特征
variables = ['PWV', 'ZTD', 'Temp', 'Press', 'rh']
df = FeatureEngineering.calculate_change_features(df, variables)
# 计算组合特征
df = FeatureEngineering.calculate_combined_features(df)
return df

113
src/utils/visualization.py Normal file
View File

@ -0,0 +1,113 @@
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
class Visualization:
def __init__(self, output_dir='figures'):
"""
初始化可视化工具
"""
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)
def plot_feature_importance(self, feature_importance, title='特征重要性'):
"""
绘制特征重要性图
"""
plt.figure(figsize=(12, 6))
feature_importance.sort_values().plot(kind='barh')
plt.title(title)
plt.xlabel('重要性')
plt.tight_layout()
plt.savefig(os.path.join(self.output_dir, 'feature_importance.png'))
plt.close()
def plot_rainfall_distribution(self, data, title='降雨量分布'):
"""
绘制降雨量分布图
"""
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='rainfall', bins=50)
plt.title(title)
plt.xlabel('降雨量 (mm)')
plt.ylabel('频次')
plt.savefig(os.path.join(self.output_dir, 'rainfall_distribution.png'))
plt.close()
def plot_time_series(self, data, variables, title='时间序列数据'):
"""
绘制时间序列图
"""
plt.figure(figsize=(15, 8))
for var in variables:
plt.plot(data['datetime'], data[var], label=var)
plt.title(title)
plt.xlabel('时间')
plt.ylabel('')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(self.output_dir, 'time_series.png'))
plt.close()
def plot_correlation_matrix(self, data, title='特征相关性矩阵'):
"""
绘制相关性矩阵热力图
"""
plt.figure(figsize=(12, 10))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title(title)
plt.tight_layout()
plt.savefig(os.path.join(self.output_dir, 'correlation_matrix.png'))
plt.close()
def plot_forecast_results(self, forecast_results, title='降雨预报结果'):
"""
绘制预报结果图
"""
time_points = sorted(forecast_results.keys())
rain_probs = [results['rain_probability'].mean() for results in forecast_results.values()]
rain_amounts = [results['rainfall_amount'].mean() for results in forecast_results.values()]
# 创建子图
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
# 绘制降雨概率
ax1.plot(time_points, rain_probs, 'b-', label='降雨概率')
ax1.fill_between(time_points, 0, rain_probs, alpha=0.2)
ax1.set_xlabel('预报时间(小时)')
ax1.set_ylabel('降雨概率')
ax1.set_title('未来降雨概率预报')
ax1.grid(True)
ax1.legend()
# 绘制降雨量
ax2.plot(time_points, rain_amounts, 'r-', label='预计降雨量')
ax2.fill_between(time_points, 0, rain_amounts, alpha=0.2)
ax2.set_xlabel('预报时间(小时)')
ax2.set_ylabel('降雨量 (mm)')
ax2.set_title('未来降雨量预报')
ax2.grid(True)
ax2.legend()
plt.tight_layout()
plt.savefig(os.path.join(self.output_dir, 'forecast_results.png'))
plt.close()
def plot_model_evaluation(self, y_true, y_pred, title='模型评估'):
"""
绘制模型评估图
"""
plt.figure(figsize=(10, 6))
plt.scatter(y_true, y_pred, alpha=0.5)
plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--')
plt.xlabel('实际值')
plt.ylabel('预测值')
plt.title(title)
plt.grid(True)
plt.savefig(os.path.join(self.output_dir, 'model_evaluation.png'))
plt.close()