应用领域拓展:机器学习、数据库与软件工程
应用领域拓展:机器学习、数据库与软件工程 【免费下载链接】open-source-cs — Video discussing this curriculum。项目地址: https://gitcode.com/gh_mirrors/...
应用领域拓展:机器学习、数据库与软件工程
文章概要介绍了斯坦福大学CS229机器学习课程的核心内容与实践指南,包括线性回归、逻辑回归、神经网络等关键算法的代码实现,以及机器学习工作流程和最佳实践。同时涵盖了数据库管理系统与SQL的深入学习,包括SQL语言体系、事务管理、性能优化策略和现代数据库发展趋势。此外,还详细阐述了软件工程原理与最佳实践,包括SOLID原则、代码审查流程、测试驱动开发、持续集成与持续部署,以及架构设计原则和DevOps文化。
斯坦福机器学习课程实战指南
斯坦福大学的CS229机器学习课程由人工智能领域的先驱Andrew Ng教授创立,是全球最负盛名的机器学习课程之一。这门课程不仅提供了坚实的理论基础,更重要的是教会学生如何将机器学习算法应用于实际问题解决。本指南将带你深入探索CS229课程的核心内容,并通过丰富的代码示例和实战项目,帮助你真正掌握机器学习的实践技能。
课程核心架构与学习路径
CS229课程采用系统化的教学方法,从基础数学概念到高级算法实现,构建了完整的学习体系:
关键算法实现与代码示例
1. 线性回归从零实现
线性回归是机器学习的基础,理解其数学原理和实现细节至关重要:
import numpy as np
import matplotlib.pyplot as plt
class LinearRegression:
    """Linear regression fitted with full-batch gradient descent on mean squared error.

    After ``fit``:
        weights: 1-D array with one coefficient per feature.
        bias: scalar intercept.
        loss_history: MSE of the predictions made at the start of each
            iteration (i.e. before that iteration's parameter update).
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X, y):
        """Fit the model to ``X`` of shape (n_samples, n_features) and targets ``y``.

        ``y`` may be 1-D or a single column; it is flattened so that the
        residual stays 1-D instead of silently broadcasting to an
        (n_samples, n_samples) matrix.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count does
                not match the number of targets.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        # Start a fresh history so that refitting does not append to the
        # losses of a previous run.
        self.loss_history = []

        for _ in range(self.n_iterations):
            residual = np.dot(X, self.weights) + self.bias - y

            # Gradients of the squared-error objective w.r.t. weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Loss of the pre-update predictions for this iteration.
            self.loss_history.append(np.mean(residual ** 2))
        return self

    def predict(self, X):
        """Return ``X @ weights + bias`` for each row of ``X``."""
        return np.dot(X, self.weights) + self.bias
# Usage example: four noiseless samples generated from y = 1*x1 + 2*x2 + 3.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
true_coefficients = np.array([1, 2])
y = X @ true_coefficients + 3

model = LinearRegression(learning_rate=0.01, n_iterations=1000)
model.fit(X, y)
predictions = model.predict(X)
2. 逻辑回归与分类问题
逻辑回归是二分类问题的核心算法,以下是完整实现:
import numpy as np
class LogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    After ``fit``, ``weights`` is a 1-D array with one coefficient per
    feature and ``bias`` is the scalar intercept.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def _sigmoid(self, z):
        """Return 1 / (1 + exp(-z)) element-wise without overflowing.

        Only ``exp(-|z|)`` is evaluated, which always lies in (0, 1]; the two
        algebraically equivalent forms below are then selected by the sign
        of ``z``. The naive formula overflows in ``exp`` for large negative z.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Fit to ``X`` of shape (n_samples, n_features) and 0/1 labels ``y``.

        ``y`` may be 1-D or a single column; it is flattened so that the
        residual stays 1-D instead of broadcasting to a square matrix.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count does
                not match the number of labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            probabilities = self._sigmoid(np.dot(X, self.weights) + self.bias)
            error = probabilities - y

            # Gradient of the mean log-loss w.r.t. weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
        return self

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the predicted probability is >= ``threshold``."""
        probabilities = self._sigmoid(np.dot(X, self.weights) + self.bias)
        return (probabilities >= threshold).astype(int)
# 乳腺癌数据集分类示例
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load scikit-learn's bundled breast-cancer dataset.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples, then standardise both splits using the mean
# and variance estimated on the training split only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train the from-scratch classifier and label the held-out samples.
model = LogisticRegression(learning_rate=0.1, n_iterations=3000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
实战项目:手写数字识别
使用神经网络实现MNIST手写数字识别,这是CS229课程的经典项目:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
class NeuralNetwork:
    """Fully connected classifier: ReLU hidden layers, softmax output, cross-entropy loss.

    Data is laid out column-wise: inputs are ``(n_features, n_samples)`` and
    one-hot labels are ``(n_classes, n_samples)``. ``parameters`` maps
    ``'W1'``/``'b1'`` ... ``'WL'``/``'bL'`` to the weight matrix and bias
    column of each layer.
    """

    def __init__(self, layer_sizes, learning_rate=0.1):
        """Create a network with ``layer_sizes[0]`` inputs and ``layer_sizes[-1]`` outputs."""
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.parameters = {}
        self.initialize_parameters()

    def initialize_parameters(self):
        """Draw small Gaussian weights (std 0.01) and zero biases for every layer."""
        for layer in range(1, len(self.layer_sizes)):
            fan_out = self.layer_sizes[layer]
            fan_in = self.layer_sizes[layer - 1]
            self.parameters[f'W{layer}'] = np.random.randn(fan_out, fan_in) * 0.01
            self.parameters[f'b{layer}'] = np.zeros((fan_out, 1))

    def relu(self, Z):
        """Element-wise max(0, Z)."""
        return np.maximum(0, Z)

    def softmax(self, Z):
        """Column-wise softmax of ``Z`` (one sample per column).

        Each column is shifted by its own maximum before exponentiation.
        Shifting by the global maximum instead would let a column whose
        logits are far below the batch maximum underflow to all zeros and
        produce 0/0 = NaN.
        """
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        expZ = np.exp(shifted)
        return expZ / expZ.sum(axis=0, keepdims=True)

    def forward_propagation(self, X):
        """Run a forward pass and return every intermediate ``Z{l}`` / ``A{l}``.

        ``cache['A0']`` is the input; the last ``A`` entry holds the softmax
        class probabilities.
        """
        cache = {'A0': X}
        L = len(self.parameters) // 2  # each layer contributes one W and one b

        # Hidden layers use ReLU.
        for layer in range(1, L):
            Z = np.dot(self.parameters[f'W{layer}'], cache[f'A{layer-1}']) + self.parameters[f'b{layer}']
            cache[f'Z{layer}'] = Z
            cache[f'A{layer}'] = self.relu(Z)

        # Output layer uses softmax.
        ZL = np.dot(self.parameters[f'W{L}'], cache[f'A{L-1}']) + self.parameters[f'b{L}']
        cache[f'Z{L}'] = ZL
        cache[f'A{L}'] = self.softmax(ZL)
        return cache

    def compute_cost(self, AL, Y):
        """Mean cross-entropy between probabilities ``AL`` and one-hot ``Y``."""
        m = Y.shape[1]
        # The 1e-8 offset keeps log() finite when a probability is exactly 0.
        cost = -np.sum(Y * np.log(AL + 1e-8)) / m
        return cost

    def backward_propagation(self, cache, Y):
        """Return gradients ``dW{l}`` / ``db{l}`` for every layer.

        ``cache`` must come from ``forward_propagation`` on the inputs that
        correspond to the one-hot labels ``Y``.
        """
        grads = {}
        m = Y.shape[1]
        L = len(self.parameters) // 2

        # For softmax combined with cross-entropy the output-layer error is A - Y.
        dZ_next = cache[f'A{L}'] - Y
        grads[f'dW{L}'] = np.dot(dZ_next, cache[f'A{L-1}'].T) / m
        grads[f'db{L}'] = np.sum(dZ_next, axis=1, keepdims=True) / m

        for layer in reversed(range(1, L)):
            dA = np.dot(self.parameters[f'W{layer+1}'].T, dZ_next)
            # ReLU derivative: pass the gradient only where the pre-activation was positive.
            dZ = dA * (cache[f'Z{layer}'] > 0).astype(float)
            grads[f'dW{layer}'] = np.dot(dZ, cache[f'A{layer-1}'].T) / m
            grads[f'db{layer}'] = np.sum(dZ, axis=1, keepdims=True) / m
            dZ_next = dZ
        return grads

    def update_parameters(self, grads):
        """Apply one in-place gradient-descent step to every W and b."""
        L = len(self.parameters) // 2
        for layer in range(1, L + 1):
            self.parameters[f'W{layer}'] -= self.learning_rate * grads[f'dW{layer}']
            self.parameters[f'b{layer}'] -= self.learning_rate * grads[f'db{layer}']

    def train(self, X, Y, iterations):
        """Run ``iterations`` full-batch gradient steps, printing the cost every 100 steps."""
        output_key = f'A{len(self.layer_sizes)-1}'
        for i in range(iterations):
            cache = self.forward_propagation(X)
            cost = self.compute_cost(cache[output_key], Y)
            grads = self.backward_propagation(cache, Y)
            self.update_parameters(grads)
            if i % 100 == 0:
                print(f"迭代 {i}, 损失: {cost}")
# Fetch MNIST from OpenML; pixels are scaled to [0, 1] and X is stored with
# one sample per column.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data.T / 255.0
y = mnist.target.astype(int)

# Fit the one-hot encoder on every label so all ten digit classes are known.
# NOTE(review): y_onehot itself is not used again in the visible code.
encoder = OneHotEncoder(sparse_output=False)
label_column = y.reshape(-1, 1)
y_onehot = encoder.fit_transform(label_column).T

# train_test_split expects one sample per row, hence the transpose back.
# NOTE(review): X_test / y_test are not evaluated in the visible code.
X_train, X_test, y_train, y_test = train_test_split(
    X.T, y, test_size=0.2, random_state=42
)

# Build the network (784 inputs, two hidden layers, 10 outputs) and train it
# on column-major inputs with one-hot targets.
nn = NeuralNetwork([784, 128, 64, 10], learning_rate=0.1)
y_train_onehot = encoder.transform(y_train.reshape(-1, 1)).T
nn.train(X_train.T, y_train_onehot, 1000)
机器学习工作流程与最佳实践
CS229课程强调的系统化机器学习工作流程:
特征工程技术对比
| 技术类型 | 方法 | 适用场景 | 优点 | 缺点 |
|---|---|---|---|---|
| 数值标准化 | StandardScaler | 基于距离的算法 | 消除量纲影响 | 对异常值敏感 |
| 归一化 | MinMaxScaler | 神经网络 | 数据范围统一 | 受极端值影响 |
| 独热编码 | OneHotEncoder | 分类变量 | 避免数值大小误解 | 维度灾难 |
| 特征选择 | SelectKBest | 高维数据 | 减少过拟合 | 可能丢失信息 |
| PCA降维 | PCA | 数据可视化 | 去除相关性 | 可解释性降低 |
模型评估与超参数优化
掌握正确的模型评估方法是机器学习实践的关键:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
def _show_confusion_matrix(matrix, model_name):
    """Draw ``matrix`` as an annotated heatmap titled with ``model_name`` and display it."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - 混淆矩阵')
    plt.ylabel('真实标签')
    plt.xlabel('预测标签')
    plt.show()


def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report, show the confusion matrix, and return the report.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        model_name: label used in the printed header and the plot title.

    Returns:
        The classification report as a dict (``output_dict=True``).
    """
    header = f"=== {model_name} 评估结果 ==="
    print(header)
    text_report = classification_report(y_true, y_pred)
    print(text_report)

    _show_confusion_matrix(confusion_matrix(y_true, y_pred), model_name)

    report_dict = classification_report(y_true, y_pred, output_dict=True)
    return report_dict
# 超参数优化示例
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Candidate values per random-forest hyperparameter; the grid search tries
# every combination (3 * 4 * 3 * 3 = 108 settings).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validation scored by accuracy, using all available cores.
search_options = dict(cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, **search_options)

# NOTE(review): X_train / y_train are whatever split is in scope at this point.
grid_search.fit(X_train, y_train)

print("最佳参数:", grid_search.best_params_)
print("最佳得分:", grid_search.best_score_)
实际应用案例:房价预测系统
结合CS229课程知识,构建完整的房价预测机器学习系统:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import joblib
class HousePricePredictor:
    """California-housing price model: degree-2 polynomial expansion, standardisation,
    then a gradient-boosting regressor.

    The same ``poly`` and ``scaler`` objects are used for training data and
    for single-sample prediction, and both are persisted with the model.
    """

    def __init__(self):
        # Function-scope import: no module-level import of PolynomialFeatures
        # is visible in this file, and predict_price needs the transformer too.
        from sklearn.preprocessing import PolynomialFeatures, StandardScaler

        self.model = GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        )
        self.poly = PolynomialFeatures(degree=2, include_bias=False)
        self.scaler = StandardScaler()
        self.is_trained = False

    def _require_trained(self):
        """Raise ValueError unless a model has been trained or loaded."""
        if not self.is_trained:
            raise ValueError("模型尚未训练")

    def _expand_features(self, features):
        """Apply the polynomial expansion, fitting the transformer first if it never was."""
        # An unfitted transformer can occur when the model was trained on
        # externally prepared data or loaded from a file saved without 'poly'.
        if hasattr(self.poly, "n_features_in_"):
            return self.poly.transform(features)
        return self.poly.fit_transform(features)

    def load_and_preprocess_data(self):
        """Load the California housing data and return ``(X_scaled, y, feature_names)``.

        ``X_scaled`` holds the standardised degree-2 polynomial features;
        ``feature_names`` are the names of the original (unexpanded) columns.

        NOTE(review): the scaler is fitted on the full dataset here, so a
        train/test split made afterwards leaks test-set statistics into
        training. Fixing that requires splitting before this step.
        """
        housing = fetch_california_housing()
        X, y = housing.data, housing.target

        X_poly = self.poly.fit_transform(X)
        X_scaled = self.scaler.fit_transform(X_poly)

        return X_scaled, y, housing.feature_names

    def train_model(self, X, y):
        """Report 5-fold cross-validated R², then fit the final model on all of ``X``.

        Returns:
            The fitted regressor.
        """
        cv_scores = cross_val_score(self.model, X, y, cv=5, scoring='r2')
        print(f"交叉验证R²分数: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")

        self.model.fit(X, y)
        self.is_trained = True

        return self.model

    def evaluate_model(self, X_test, y_test):
        """Print and return ``(mse, r2)`` on already-preprocessed test data.

        Raises:
            ValueError: if the model has not been trained or loaded.
        """
        self._require_trained()

        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"测试集MSE: {mse:.3f}")
        print(f"测试集R²: {r2:.3f}")

        return mse, r2

    def predict_price(self, features):
        """Predict the target for one house given its raw (unexpanded) feature values.

        Raises:
            ValueError: if the model has not been trained or loaded.
        """
        self._require_trained()

        # Same pipeline as training: a single row -> polynomial terms -> scaling.
        row = np.asarray(features, dtype=float).reshape(1, -1)
        row_scaled = self.scaler.transform(self._expand_features(row))

        prediction = self.model.predict(row_scaled)
        return prediction[0]

    def save_model(self, filepath):
        """Persist the regressor together with its preprocessing objects."""
        joblib.dump({
            'model': self.model,
            'scaler': self.scaler,
            'poly': self.poly
        }, filepath)

    def load_model(self, filepath):
        """Restore a model written by ``save_model``.

        Files saved without a 'poly' entry are still accepted; the current
        transformer is kept in that case.

        NOTE(review): joblib.load unpickles the file and can execute
        arbitrary code; only load files from trusted sources.
        """
        saved_data = joblib.load(filepath)
        self.model = saved_data['model']
        self.scaler = saved_data['scaler']
        self.poly = saved_data.get('poly', self.poly)
        self.is_trained = True
# Usage example: load and preprocess the data through the predictor.
predictor = HousePricePredictor()
X, y, feature_names = predictor.load_and_preprocess_data()

# Hold out 20% of the rows for testing.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Fit on the training rows, then score on the held-out rows.
predictor.train_model(X_train, y_train)
predictor.evaluate_model(X_test, y_test)

# Predict for one house described by its eight raw feature values.
sample_house = [
    8.3252, 41.0, 6.984127, 1.023810,
    322.0, 2.555556, 37.88, -122.23,
]
predicted_price = predictor.predict_price(sample_house)
# The prediction is multiplied by 100000 for display as a dollar amount.
print(f"预测房价: ${predicted_price * 100000:.2f}")
通过这个完整的实战指南,你不仅能够理解斯坦福CS229课程的理论精髓,更重要的是掌握了将机器学习算法应用于实际问题的完整技能栈。从基础的线性回归到复杂的神经网络,从模型训练到部署应用,每一个环节都配备了详细的代码示例和最佳实践建议。
数据库管理系统与SQL深入学习
在现代数据驱动的世界中,数据库管理系统(DBMS)和SQL语言构成了数据存储、管理和分析的核心技术栈。作为计算机科学教育的重要组成部分,深入理解数据库系统不仅涉及基础的CRUD操作,更需要掌握高级的SQL特性、事务管理、性能优化以及系统架构设计。
数据库系统架构与核心组件
数据库管理系统是一个复杂的软件系统,由多个协同工作的组件构成,每个组件都承担着特定的职责:
数据库系统的核心组件包括:
- 硬件层:物理存储设备、服务器和网络基础设施
- 软件层:数据库引擎、操作系统接口和网络通信模块
- 数据层:结构化数据存储和元数据管理系统
- 过程层:操作流程、备份恢复和安全策略
更多推荐
所有评论(0)