Projet Capstone : Application ML Full-Stack
Guide étape par étape pour créer une application ML complète avec Streamlit, FastAPI, H2O et MLOps.
Par InSkillCoach · Temps de lecture : quelques minutes
Projet Capstone : Application ML Full-Stack
Guide complet pour créer une application de prédiction de prix immobiliers avec l’écosystème ML moderne.
1. Configuration Initiale
1.1 Structure du Projet
real-estate-ml/
├── data/
│ ├── raw/
│ │ └── housing_data.csv
│ └── processed/
├── models/
│ ├── trained/
│ └── deployed/
├── src/
│ ├── data/
│ │ ├── __init__.py
│ │ ├── prepare.py
│ │ └── features.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── train.py
│ │ └── evaluate.py
│ ├── api/
│ │ ├── __init__.py
│ │ └── main.py
│ └── frontend/
│ ├── __init__.py
│ └── app.py
├── tests/
├── notebooks/
├── configs/
│ └── config.yaml
├── requirements.txt
├── Dockerfile
├── docker-compose.yml
└── README.md
1.2 Installation des Dépendances
# requirements.txt
streamlit==1.24.0
fastapi==0.95.0
uvicorn==0.21.1
h2o==3.40.0
pandas==1.5.3
numpy==1.24.2
scikit-learn==1.2.2
mlflow==2.3.1
dvc==3.15.1
pytest==7.3.1
python-dotenv==1.0.0
2. Préparation des Données
2.1 Script de Préparation
# src/data/prepare.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import yaml
def load_config():
    """Read and return the project configuration from configs/config.yaml."""
    with open("configs/config.yaml", "r") as config_file:
        return yaml.safe_load(config_file)
def prepare_data():
    """Load the raw housing data, clean it, engineer features, and write
    train/test splits to data/processed/.

    Returns the (train, test) DataFrames.

    NOTE(review): clean_data and engineer_features live in separate
    modules in this tutorial but are not imported here — confirm the
    imports exist in the real file.
    """
    config = load_config()

    # Raw CSV -> cleaned, feature-engineered frame.
    raw = pd.read_csv("data/raw/housing_data.csv")
    cleaned = clean_data(raw)
    featured = engineer_features(cleaned)

    # Reproducible split, driven entirely by the config file.
    split_cfg = config["data"]
    train, test = train_test_split(
        featured,
        test_size=split_cfg["test_size"],
        random_state=split_cfg["random_state"],
    )

    # Persist both splits for the downstream DVC stages.
    train.to_csv("data/processed/train.csv", index=False)
    test.to_csv("data/processed/test.csv", index=False)
    return train, test
def clean_data(data):
    """Clean the raw housing frame.

    - Drops duplicate rows.
    - Drops rows whose target ("price") is missing: the original code
      imputed the target with its mean, which fabricates labels and
      biases training.
    - Median-imputes the remaining numeric feature columns (median is
      robust to the long right tail typical of housing data).

    Returns the cleaned DataFrame.
    """
    data = data.drop_duplicates()

    # Never impute the target — a row without a label is unusable.
    data = data.dropna(subset=["price"])

    data = data.fillna({
        "bedrooms": data["bedrooms"].median(),
        "bathrooms": data["bathrooms"].median(),
    })
    return data
2.2 Feature Engineering
# src/data/features.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
def engineer_features(data):
    """Derive model features from the cleaned housing frame.

    Adds ratio/aggregate columns, one-hot encodes the zipcode, and
    standardizes the numeric FEATURE columns.

    NOTE(review): price_per_sqft embeds the target ("price") in a
    feature — that leaks the label into training. Kept for backward
    compatibility with the rest of the tutorial; strongly consider
    dropping it.
    """
    # Guard against division by zero: a 0-sqft row becomes NaN instead
    # of +/-inf (the original produced inf).
    sqft = data["sqft_living"].replace(0, np.nan)
    data["price_per_sqft"] = data["price"] / sqft
    data["total_rooms"] = data["bedrooms"] + data["bathrooms"]

    # One-hot encode the only categorical column.
    data = pd.get_dummies(data, columns=["zipcode"])

    # Standardize the features only. The original also scaled the
    # target ("price"): the model would then learn standardized prices
    # while the API reports the prediction as raw euros — a pipeline bug.
    scaler = StandardScaler()
    feature_columns = ["sqft_living", "bedrooms", "bathrooms"]
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    return data
3. Modèle ML avec H2O
3.1 Configuration du Modèle
# src/models/train.py
import h2o
from h2o.automl import H2OAutoML
import mlflow
from mlflow.tracking import MlflowClient
def train_model(train_data, test_data):
    """Train an H2O AutoML regressor for "price" and log the run to MLflow.

    Args:
        train_data, test_data: pandas DataFrames (output of the data prep
            stage), both containing the "price" target column.

    Returns:
        The AutoML leader model.
    """
    h2o.init()

    train = h2o.H2OFrame(train_data)
    test = h2o.H2OFrame(test_data)

    # Keep the AutoML knobs in named variables so the values logged to
    # MLflow can never drift from the ones actually used (the original
    # duplicated the literals in two places).
    max_models = 10
    max_runtime_secs = 300
    nfolds = 5
    aml = H2OAutoML(
        max_models=max_models,
        seed=42,
        max_runtime_secs=max_runtime_secs,
        sort_metric="RMSE",
        balance_classes=False,
        nfolds=nfolds,
    )

    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("real_estate_prediction")

    with mlflow.start_run():
        # BUG FIX: the original passed x=train.columns, which includes
        # the target "price" among the predictors. Exclude it explicitly.
        predictors = [col for col in train.columns if col != "price"]
        aml.train(x=predictors, y="price", training_frame=train)

        performance = aml.leader.model_performance(test)

        mlflow.log_params({
            "max_models": max_models,
            "max_runtime_secs": max_runtime_secs,
            "nfolds": nfolds,
        })
        mlflow.log_metrics({
            "rmse": performance.rmse(),
            "mae": performance.mae(),
            "r2": performance.r2(),
        })

        # h2o.save_model returns the exact path of the saved model;
        # log that path rather than the whole directory.
        model_path = h2o.save_model(aml.leader, path="models/trained/")
        mlflow.log_artifact(model_path)

    return aml.leader
3.2 Évaluation du Modèle
# src/models/evaluate.py
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
def evaluate_model(model, test_data):
    """Score *model* on *test_data* and return regression metrics.

    Expects test_data to contain the ground-truth "price" column.
    Returns a dict with keys "rmse", "r2" and "mae".
    """
    y_true = test_data["price"]
    y_pred = model.predict(test_data)

    return {
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
        "r2": r2_score(y_true, y_pred),
        "mae": np.mean(np.abs(y_true - y_pred)),
    }
4. API FastAPI
4.1 Configuration de l’API
# src/api/main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import h2o
import numpy as np
from typing import List, Optional
# FastAPI application exposing the trained model over HTTP.
app = FastAPI(
    title="Real Estate Price Predictor",
    description="API pour prédire les prix immobiliers",
    version="1.0.0"
)
# Modèles de données
class HouseFeatures(BaseModel):
    """Request body for /predict: raw characteristics of one house."""
    sqft_living: float
    bedrooms: int
    bathrooms: float  # float: allows fractional bathroom counts
    zipcode: str  # one-hot encoded during feature engineering (features.py)
    year_built: Optional[int] = None  # optional; the Streamlit form does not send it
class PredictionResponse(BaseModel):
    """Response body for /predict."""
    predicted_price: float
    confidence: float  # currently hard-coded to 0.95 by the endpoint
    model_version: str
# Load the trained H2O model once, at import time.
# NOTE(review): h2o.init() is never called in this file, and train.py
# saves the model under models/trained/ with an H2O-generated name —
# confirm this exact path exists and an H2O cluster is up before load.
model = h2o.load_model("models/trained/real_estate_model")
@app.post("/predict", response_model=PredictionResponse)
async def predict(features: HouseFeatures):
    """Predict the price of a house from its features.

    Returns the predicted price plus a (currently hard-coded)
    confidence and the model version. Any failure is surfaced as an
    HTTP 500 with the error message in the detail field.
    """
    try:
        # NOTE(review): prepare_input must convert the pydantic model
        # into a frame matching the training schema (including one-hot
        # zipcode columns) — it is not defined in this file; confirm it
        # exists elsewhere.
        input_data = prepare_input(features)

        prediction = model.predict(input_data)

        # BUG FIX: H2O's predict returns an H2OFrame; prediction[0]
        # yields another (1-column) frame, which float() cannot convert.
        # flatten() extracts the single scalar prediction.
        predicted_price = float(prediction.flatten())

        return PredictionResponse(
            predicted_price=predicted_price,
            confidence=0.95,  # placeholder — derive from the model when possible
            model_version="1.0.0"
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
5. Interface Streamlit
5.1 Application Frontend
# src/frontend/app.py
import streamlit as st
import requests
import pandas as pd
import plotly.express as px
def main():
    """Streamlit front end: collect house features and query the
    prediction API at http://localhost:8000/predict."""
    st.set_page_config(
        page_title="Prédiction de Prix Immobilier",
        page_icon="🏠",
        layout="wide"
    )
    st.title("Prédiction de Prix Immobilier")

    # Sidebar: model options. Currently informational only — the
    # selection is not sent to the API.
    with st.sidebar:
        st.header("Paramètres du Modèle")
        model_type = st.selectbox(
            "Type de Modèle",
            ["AutoML", "Random Forest", "Gradient Boosting"]
        )

    # Input form for the house features.
    with st.form("prediction_form"):
        col1, col2 = st.columns(2)
        with col1:
            sqft_living = st.number_input("Surface (m²)", min_value=0)
            bedrooms = st.number_input("Chambres", min_value=0)
        with col2:
            bathrooms = st.number_input("Salles de bain", min_value=0)
            zipcode = st.text_input("Code postal")
        submitted = st.form_submit_button("Prédire")

    if submitted:
        # Call the prediction API. A timeout prevents the UI from
        # hanging forever if the backend is down (the original call had
        # none), and network errors are reported instead of crashing
        # the app with an unhandled exception.
        try:
            response = requests.post(
                "http://localhost:8000/predict",
                json={
                    "sqft_living": sqft_living,
                    "bedrooms": bedrooms,
                    "bathrooms": bathrooms,
                    "zipcode": zipcode
                },
                timeout=10,
            )
        except requests.RequestException as exc:
            st.error(f"Erreur lors de la prédiction : {exc}")
            return

        if response.status_code == 200:
            result = response.json()
            st.success(f"Prix prédit : {result['predicted_price']:,.2f} €")
            # NOTE(review): show_visualizations is not defined in this
            # file — confirm it is imported or defined elsewhere.
            show_visualizations(result)
        else:
            st.error("Erreur lors de la prédiction")


if __name__ == "__main__":
    main()
6. Configuration MLOps
6.1 Pipeline DVC
# dvc.yaml
# DVC pipeline: prepare -> train -> evaluate.
stages:
  prepare:
    # Cleans the raw CSV and writes the train/test splits.
    cmd: python src/data/prepare.py
    deps:
      - data/raw/housing_data.csv
    outs:
      - data/processed/train.csv
      - data/processed/test.csv
  train:
    # Trains the H2O AutoML model on the prepared splits.
    cmd: python src/models/train.py
    deps:
      - data/processed/train.csv
      - data/processed/test.csv
    outs:
      - models/trained/
  evaluate:
    # Scores the trained model; metrics.json stays in git (cache: false).
    cmd: python src/models/evaluate.py
    deps:
      - models/trained/
      - data/processed/test.csv
    metrics:
      - metrics.json:
          cache: false
6.2 Configuration Docker
# Dockerfile
FROM python:3.9-slim

WORKDIR /app

# Copy requirements first so Docker layer caching is effective:
# source edits no longer invalidate the pip-install layer.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image
# (the original omitted it, needlessly bloating the image).
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# 8501: Streamlit UI; 8000: FastAPI backend (docker-compose overrides
# CMD with uvicorn for the backend service).
EXPOSE 8501
EXPOSE 8000

CMD ["streamlit", "run", "src/frontend/app.py"]
# docker-compose.yml
# Three services: the Streamlit frontend, the FastAPI backend (same
# image, different command) and an MLflow tracking server.
version: '3.8'
services:
  frontend:
    build: .
    ports:
      - "8501:8501"
    depends_on:
      - backend
  backend:
    build: .
    ports:
      - "8000:8000"
    # Same image as the frontend; only the start command differs.
    command: uvicorn src.api.main:app --host 0.0.0.0 --port 8000
    depends_on:
      - mlflow
  mlflow:
    image: ghcr.io/mlflow/mlflow:latest
    ports:
      - "5000:5000"
    # Persist runs on the host so experiment history survives restarts.
    volumes:
      - ./mlruns:/mlruns
7. Déploiement et Monitoring
7.1 GitHub Actions
# .github/workflows/deploy.yml
# CI/CD: train the model via DVC, build/push the image, deploy to k8s.
name: Deploy ML Application

on:
  push:
    branches: [ main ]

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      # FIX: v2 of checkout/setup-python runs on a deprecated Node
      # runtime and is disabled on current GitHub-hosted runners —
      # use the maintained major versions.
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Train model
        run: dvc repro train
      - name: Build and push Docker images
        # NOTE(review): this pushes an unqualified image name with no
        # registry login — confirm a docker/login-action step and a
        # fully-qualified tag (e.g. ghcr.io/org/real-estate-ml) exist.
        run: |
          docker build -t real-estate-ml .
          docker push real-estate-ml
      - name: Deploy to production
        run: |
          kubectl apply -f k8s/
7.2 Monitoring
# src/monitoring/metrics.py
from prometheus_client import Counter, Histogram
import time
# Prometheus instruments for the prediction service.
# NOTE(review): nothing in this snippet exposes them over HTTP —
# confirm prometheus_client's start_http_server (or an exporter
# endpoint) is wired up elsewhere.
PREDICTIONS = Counter('model_predictions_total', 'Total des prédictions')
LATENCY = Histogram('model_latency_seconds', 'Latence des prédictions')
ERRORS = Counter('model_errors_total', 'Total des erreurs')

def track_prediction(latency):
    """Record one successful prediction and observe its latency (seconds)."""
    PREDICTIONS.inc()
    LATENCY.observe(latency)

def track_error():
    """Record one failed prediction."""
    ERRORS.inc()
8. Tests
8.1 Tests Unitaires
# tests/test_api.py
import pytest
from fastapi.testclient import TestClient
from src.api.main import app
# Shared in-process client for the FastAPI application under test.
client = TestClient(app)

def test_predict():
    """A well-formed payload gets a 200 response containing a price."""
    payload = {
        "sqft_living": 1000,
        "bedrooms": 3,
        "bathrooms": 2,
        "zipcode": "75001",
    }
    response = client.post("/predict", json=payload)
    assert response.status_code == 200
    assert "predicted_price" in response.json()
9. Exécution du Projet
9.1 Instructions de Démarrage
# 1. Clone the project
git clone https://github.com/votre-username/real-estate-ml.git
cd real-estate-ml
# 2. Create the virtual environment
python -m venv venv
source venv/bin/activate  # Linux/Mac
venv\Scripts\activate  # Windows
# 3. Install the dependencies
pip install -r requirements.txt
# 4. Prepare the data (pull DVC-tracked inputs, then run the prepare stage)
dvc pull
dvc repro prepare
# 5. Train the model
dvc repro train
# 6. Launch the full application stack (frontend + API + MLflow)
docker-compose up
9.2 Accès aux Services
- Frontend Streamlit : http://localhost:8501
- API FastAPI : http://localhost:8000
- Documentation API : http://localhost:8000/docs
- MLflow : http://localhost:5000
Conclusion
Points clés du projet :
- Architecture complète ML
- Pipeline automatisé
- Interface utilisateur intuitive
- Monitoring efficace
Recommandations pour l’amélioration :
- Ajouter plus de tests
- Optimiser les performances
- Améliorer la documentation
- Ajouter des visualisations avancées
- Implémenter l’authentification
À propos de InSkillCoach
Expert en formation et technologies
Coach spécialisé dans les technologies avancées et l'IA, porté par GNeurone Inc.
Certifications:
- AWS Certified Solutions Architect – Professional
- Certifications Google Cloud
- Microsoft Certified: DevOps Engineer Expert
- Certified Kubernetes Administrator (CKA)
- CompTIA Security+
959
283
Commentaires
Les commentaires sont alimentés par GitHub Discussions
Connectez-vous avec GitHub pour participer à la discussion