# Performance Optimization Guide

This guide provides comprehensive strategies for optimizing NowYouSeeMe performance across all system components. Follow these guidelines to achieve the best possible performance for your specific use case.

## 🎯 Performance Targets

### Real-time Requirements

| Metric | Target | Acceptable Range | Critical |
|--------|--------|------------------|----------|
| **Latency** | <20ms | 15-25ms | >30ms |
| **Accuracy** | <10cm | 8-15cm | >20cm |
| **Frame Rate** | 30-60 FPS | 25-60 FPS | <20 FPS |
| **CSI Rate** | ≥100 pkt/s | 80-120 pkt/s | <50 pkt/s |

### Resource Utilization

| Component | CPU Target | GPU Target | Memory Target |
|-----------|------------|------------|---------------|
| **Camera Capture** | <10% | N/A | <500MB |
| **CSI Processing** | <15% | N/A | <1GB |
| **Vision SLAM** | <40% | <60% | <2GB |
| **RF SLAM** | <20% | N/A | <1GB |
| **Sensor Fusion** | <15% | <20% | <1GB |
| **Rendering** | <10% | <80% | <2GB |

## 🔧 Hardware Optimization

### GPU Configuration

#### NVIDIA GPU Setup
```bash
|
||
|
|
# Check GPU status
|
||
|
|
nvidia-smi
|
||
|
|
|
||
|
|
# Set GPU power management
|
||
|
|
sudo nvidia-smi -pm 1
|
||
|
|
|
||
|
|
# Set GPU memory allocation
|
||
|
|
export CUDA_VISIBLE_DEVICES=0
|
||
|
|
export CUDA_MEMORY_FRACTION=0.8
|
||
|
|
|
||
|
|
# Optimize GPU settings
|
||
|
|
nvidia-settings --assign GPUPowerMizerMode=1
|
||
|
|
```
|
||
|
|
|
||
|
|
#### GPU Memory Optimization
|
||
|
|
```python
|
||
|
|
# In your application
|
||
|
|
import torch
|
||
|
|
import cupy as cp
|
||
|
|
|
||
|
|
# Set memory fraction
|
||
|
|
torch.cuda.set_per_process_memory_fraction(0.8)
|
||
|
|
|
||
|
|
# Clear cache periodically
|
||
|
|
torch.cuda.empty_cache()
|
||
|
|
cp.get_default_memory_pool().free_all_blocks()
|
||
|
|
```
|
||
|
|
|
||
|
|
### CPU Optimization
|
||
|
|
|
||
|
|
#### Multi-threading Configuration
|
||
|
|
```python
|
||
|
|
# Configure thread pools
|
||
|
|
import multiprocessing as mp
|
||
|
|
|
||
|
|
# Set optimal thread count
|
||
|
|
optimal_threads = min(mp.cpu_count(), 8)
|
||
|
|
mp.set_start_method('spawn', force=True)
|
||
|
|
|
||
|
|
# Configure OpenMP
|
||
|
|
import os
|
||
|
|
os.environ['OMP_NUM_THREADS'] = str(optimal_threads)
|
||
|
|
os.environ['MKL_NUM_THREADS'] = str(optimal_threads)
|
||
|
|
```
|
||
|
|
|
||
|
|
#### CPU Affinity
|
||
|
|
```bash
|
||
|
|
# Set CPU affinity for critical processes
|
||
|
|
sudo taskset -cp 0-3 <process_id>
|
||
|
|
|
||
|
|
# Or in Python
|
||
|
|
import os
|
||
|
|
os.sched_setaffinity(0, {0, 1, 2, 3})
|
||
|
|
```
|
||
|
|
|
||
|
|
### Memory Optimization
|
||
|
|
|
||
|
|
#### Memory Management
|
||
|
|
```python
|
||
|
|
# Monitor memory usage
|
||
|
|
import psutil
|
||
|
|
import gc
|
||
|
|
|
||
|
|
def optimize_memory():
    """Free cached memory and report current process usage.

    Forces a garbage-collection pass, releases cached GPU memory when a
    CUDA-capable torch is installed, and returns the resident set size
    of the current process in megabytes.

    Returns:
        float: Current RSS memory usage in MB.
    """
    # Collect garbage first so freed Python objects are actually
    # returned to the allocator before we measure.
    gc.collect()

    # The original called torch.cuda.empty_cache() unconditionally,
    # which raises on CPU-only installs; guard both the import and the
    # CUDA availability check.
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except ImportError:
        pass

    # Report resident memory of this process.
    process = psutil.Process()
    memory_mb = process.memory_info().rss / 1024 / 1024
    print(f"Memory usage: {memory_mb:.1f} MB")
    return memory_mb
|
||
|
|
```
|
||
|
|
|
||
|
|
#### Memory Pooling
|
||
|
|
```python
|
||
|
|
# Use memory pools for frequent allocations
|
||
|
|
import numpy as np
|
||
|
|
from memory_profiler import profile
|
||
|
|
|
||
|
|
class MemoryPool:
    """Free-list pool of NumPy arrays to avoid repeated allocation.

    Buffers are kept flattened and bucketed by (element count, dtype) so
    a pooled buffer is only reused when it is actually compatible with
    the requested shape.  The original implementation popped an
    arbitrary buffer and reshaped it, which raises ValueError when the
    sizes differ and silently returns the wrong dtype otherwise; it also
    handed back stale data instead of zeros.
    """

    def __init__(self, size=1000):
        # Maximum total number of buffers retained across all buckets.
        self.size = size
        # (n_elements, dtype) -> list of spare flattened arrays.
        self.pool = {}
        self._count = 0

    def get_array(self, shape, dtype=np.float32):
        """Return a zeroed array of *shape*, reusing a pooled buffer when possible."""
        n = int(np.prod(shape))
        key = (n, np.dtype(dtype))
        bucket = self.pool.get(key)
        if bucket:
            self._count -= 1
            arr = bucket.pop().reshape(shape)
            arr.fill(0)  # pooled buffers may contain stale data
            return arr
        return np.zeros(shape, dtype=dtype)

    def return_array(self, array):
        """Give an array back to the pool (silently dropped when full)."""
        if self._count < self.size:
            flat = array.reshape(-1)
            self.pool.setdefault((flat.size, flat.dtype), []).append(flat)
            self._count += 1
|
||
|
|
```
|
||
|
|
|
||
|
|
## 📊 Performance Monitoring
|
||
|
|
|
||
|
|
### Real-time Monitoring
|
||
|
|
```python
|
||
|
|
import time
|
||
|
|
import threading
|
||
|
|
from collections import deque
|
||
|
|
|
||
|
|
class PerformanceMonitor:
    """Background sampler for system and application performance metrics.

    Keeps a rolling window (last 100 samples) per metric and provides
    helpers to average them, check them against the real-time targets,
    and produce optimization recommendations.
    """

    def __init__(self):
        # deque(maxlen=100) discards the oldest sample automatically,
        # so memory use stays bounded no matter how long we run.
        self.metrics = {
            'latency': deque(maxlen=100),
            'fps': deque(maxlen=100),
            'accuracy': deque(maxlen=100),
            'cpu_usage': deque(maxlen=100),
            'gpu_usage': deque(maxlen=100),
            'memory_usage': deque(maxlen=100)
        }
        self.running = False
        self.monitor_thread = None

    def start_monitoring(self):
        """Start the background monitoring thread."""
        self.running = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop)
        self.monitor_thread.start()

    def stop_monitoring(self):
        """Stop monitoring and wait for the thread to exit."""
        self.running = False
        if self.monitor_thread:
            self.monitor_thread.join()

    def _monitor_loop(self):
        """Collect metrics at ~10 Hz until stopped."""
        while self.running:
            self._collect_metrics()
            time.sleep(0.1)  # 10Hz monitoring

    def _collect_metrics(self):
        """Sample CPU, memory and (when available) GPU utilization."""
        # Local import keeps the class importable even when the caller
        # did not import psutil at module scope.
        import psutil

        # CPU usage (percent since last call).
        self.metrics['cpu_usage'].append(psutil.cpu_percent())

        # System memory usage in percent.
        self.metrics['memory_usage'].append(psutil.virtual_memory().percent)

        # GPU usage; records 0 when NVML or an NVIDIA GPU is missing.
        # The original used a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit - narrowed to Exception.
        try:
            import pynvml
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            gpu_util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.metrics['gpu_usage'].append(gpu_util.gpu)
        except Exception:
            self.metrics['gpu_usage'].append(0)

    def get_average_metrics(self):
        """Return the mean of each metric over its window (0 when empty)."""
        return {
            metric: sum(values) / len(values) if values else 0
            for metric, values in self.metrics.items()
        }

    def get_performance_report(self):
        """Build a report with overall status, averages and recommendations."""
        avg_metrics = self.get_average_metrics()

        report = {
            'status': 'optimal' if self._check_targets(avg_metrics) else 'needs_optimization',
            'metrics': avg_metrics,
            'recommendations': self._generate_recommendations(avg_metrics)
        }

        return report

    def _check_targets(self, metrics):
        """True when latency (<20ms), fps (>30) and accuracy (<10cm) all meet targets."""
        return (
            metrics.get('latency', 0) < 20 and
            metrics.get('fps', 0) > 30 and
            metrics.get('accuracy', 0) < 10
        )

    def _generate_recommendations(self, metrics):
        """Return human-readable hints for every out-of-range metric."""
        recommendations = []

        if metrics.get('latency', 0) > 20:
            recommendations.append("High latency detected - consider reducing processing load")

        if metrics.get('fps', 0) < 30:
            recommendations.append("Low frame rate - check GPU utilization and rendering settings")

        if metrics.get('cpu_usage', 0) > 80:
            recommendations.append("High CPU usage - consider reducing thread count or processing quality")

        if metrics.get('memory_usage', 0) > 80:
            recommendations.append("High memory usage - consider clearing caches or reducing buffer sizes")

        return recommendations
|
||
|
|
```
|
||
|
|
|
||
|
|
### Profiling Tools
|
||
|
|
|
||
|
|
#### CPU Profiling
|
||
|
|
```python
|
||
|
|
import cProfile
|
||
|
|
import pstats
|
||
|
|
import io
|
||
|
|
|
||
|
|
def profile_function(func, *args, **kwargs):
    """Run *func* under cProfile and return its result.

    Prints the 20 entries with the highest cumulative time.  The
    original left the profiler enabled if *func* raised; the
    try/finally guarantees it is always disabled.
    """
    pr = cProfile.Profile()
    pr.enable()
    try:
        result = func(*args, **kwargs)
    finally:
        pr.disable()

    # Render the hottest entries into a string and print it.
    s = io.StringIO()
    ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
    ps.print_stats(20)

    print(s.getvalue())
    return result
|
||
|
|
```
|
||
|
|
|
||
|
|
#### Memory Profiling
|
||
|
|
```python
|
||
|
|
from memory_profiler import profile
|
||
|
|
|
||
|
|
# Template: decorate any function with @profile (from memory_profiler) to get
# a line-by-line memory report when the script runs under
# `python -m memory_profiler`.
@profile
def memory_intensive_function():
    """Function to profile memory usage"""
    # Your memory-intensive code here
    pass
|
||
|
|
```
|
||
|
|
|
||
|
|
#### GPU Profiling
|
||
|
|
```python
|
||
|
|
import torch
|
||
|
|
|
||
|
|
def profile_gpu_operations():
    """Profile CPU and CUDA activity around a block of GPU work.

    Prints a per-operator summary table sorted by total CUDA time.
    """
    tracked_activities = [
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
    with torch.profiler.profile(
        activities=tracked_activities,
        record_shapes=True,
        with_stack=True
    ) as prof:
        # Your GPU operations here
        pass

    summary = prof.key_averages().table(sort_by="cuda_time_total")
    print(summary)
|
||
|
|
```
|
||
|
|
|
||
|
|
## ⚡ Algorithm Optimization
|
||
|
|
|
||
|
|
### Vision SLAM Optimization
|
||
|
|
|
||
|
|
#### Feature Detection Optimization
|
||
|
|
```python
|
||
|
|
import cv2
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
class OptimizedFeatureDetector:
    """FAST-based keypoint detector capped at a fixed feature budget."""

    def __init__(self, max_features=1000, quality_level=0.01):
        self.max_features = max_features
        self.quality_level = quality_level
        # FAST with non-max suppression keeps only local response maxima.
        self.detector = cv2.FastFeatureDetector_create(
            threshold=10,
            nonmaxSuppression=True
        )

    def detect_features(self, image):
        """Detect up to ``max_features`` keypoints, keeping the strongest responses."""
        # FAST operates on single-channel images; convert colour input first.
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        found = self.detector.detect(gray)

        # Over budget: keep only the strongest responses.
        if len(found) > self.max_features:
            ranked = sorted(found, key=lambda kp: kp.response, reverse=True)
            found = ranked[:self.max_features]

        return found
|
||
|
|
```
|
||
|
|
|
||
|
|
#### Tracking Optimization
|
||
|
|
```python
|
||
|
|
class OptimizedTracker:
    """Frame-to-frame feature tracker using pyramidal Lucas-Kanade optical flow."""

    def __init__(self):
        self.prev_frame = None
        self.prev_keypoints = None
        self.prev_descriptors = None

    def track_features(self, frame, keypoints, descriptors):
        """Track previous keypoints into *frame*; returns (keypoints, descriptors).

        On the first call (no previous frame) the inputs are stored and
        returned unchanged.  Afterwards, Lucas-Kanade optical flow
        follows the previous keypoints and only successfully tracked
        entries are kept.

        NOTE(review): assumes `keypoints`/`descriptors` align 1:1 with the
        previously stored keypoints - confirm against the caller.
        """
        if self.prev_frame is None:
            self.prev_frame = frame
            self.prev_keypoints = keypoints
            self.prev_descriptors = descriptors
            return keypoints, descriptors

        if len(self.prev_keypoints) > 0:
            prev_pts = np.float32([kp.pt for kp in self.prev_keypoints]).reshape(-1, 1, 2)
            curr_pts, status, error = cv2.calcOpticalFlowPyrLK(
                self.prev_frame, frame, prev_pts, None
            )

            # `status` is an (N, 1) uint8 array (1 = tracked).  The original
            # boolean-indexed the Python keypoint *lists* directly
            # (`keypoints[status.ravel() == 1]`), which raises TypeError -
            # lists do not support mask indexing.  Filter explicitly instead.
            mask = status.ravel() == 1
            good_new = [kp for kp, ok in zip(keypoints, mask) if ok]

            # Update tracking state for the next frame.
            self.prev_frame = frame
            self.prev_keypoints = good_new
            if descriptors is not None:
                self.prev_descriptors = descriptors[mask]
            else:
                self.prev_descriptors = None

            return good_new, self.prev_descriptors

        return keypoints, descriptors
|
||
|
|
```
|
||
|
|
|
||
|
|
### RF SLAM Optimization
|
||
|
|
|
||
|
|
#### CSI Processing Optimization
|
||
|
|
```python
|
||
|
|
import numpy as np
|
||
|
|
from scipy import signal
|
||
|
|
|
||
|
|
class OptimizedCSIProcessor:
    """Windowed FFT processing and MUSIC-based AoA estimation for CSI packets."""

    def __init__(self, sample_rate=1000, window_size=64):
        self.sample_rate = sample_rate
        self.window_size = window_size
        # Hann window reduces spectral leakage; computed once and reused.
        self.window = signal.windows.hann(window_size)

    def process_csi_packet(self, csi_data):
        """Window one CSI packet and return its positive-frequency spectrum.

        Assumes ``len(csi_data) == window_size`` (the element-wise multiply
        with the cached window fails otherwise - TODO confirm packet size).
        """
        windowed_data = csi_data * self.window

        # Round the FFT length down to a power of two for speed.
        fft_size = 2 ** int(np.log2(len(windowed_data)))
        spectrum = np.fft.fft(windowed_data, fft_size)

        # Keep only the positive-frequency half.
        relevant_bins = spectrum[:fft_size // 2]

        return relevant_bins

    def estimate_aoa(self, csi_packets):
        """Estimate the dominant angle-of-arrival index from several packets."""
        processed_packets = [self.process_csi_packet(packet) for packet in csi_packets]

        # Correlate across packets; the eigendecomposition separates the
        # signal and noise subspaces (simplified MUSIC).
        correlation_matrix = np.corrcoef(processed_packets)
        eigenvalues, eigenvectors = np.linalg.eigh(correlation_matrix)

        # Smallest eigenvectors span the noise subspace; assumes 3 sources.
        noise_subspace = eigenvectors[:, :-3]
        aoa_spectrum = self._music_spectrum(noise_subspace)

        return np.argmax(aoa_spectrum)

    def _music_spectrum(self, noise_subspace):
        """Evaluate the MUSIC pseudo-spectrum over 180 candidate angles."""
        angles = np.linspace(-np.pi / 2, np.pi / 2, 180)
        spectrum = np.zeros(len(angles))

        for i, angle in enumerate(angles):
            # Steering vector for a 4-element uniform linear array.
            steering_vector = np.exp(1j * 2 * np.pi * np.arange(4) * np.sin(angle))
            denom = steering_vector.conj() @ noise_subspace @ noise_subspace.conj().T @ steering_vector
            # The quadratic form is complex in general; the original assigned
            # it straight into a float array (ComplexWarning / error in
            # NumPy).  Use its magnitude - the standard MUSIC formulation.
            spectrum[i] = 1.0 / np.abs(denom)

        return spectrum
|
||
|
|
```
|
||
|
|
|
||
|
|
### Sensor Fusion Optimization
|
||
|
|
|
||
|
|
#### EKF Optimization
|
||
|
|
```python
|
||
|
|
import numpy as np
|
||
|
|
from scipy.linalg import solve_discrete_lyapunov
|
||
|
|
|
||
|
|
class OptimizedEKF:
    """Extended Kalman filter with a constant-velocity motion model.

    State layout: ``[x, y, z, vx, vy, vz]``; measurements are 3-D
    positions.  Noise covariances are fixed at construction.
    """

    def __init__(self, state_dim=6, measurement_dim=3):
        self.state_dim = state_dim
        self.measurement_dim = measurement_dim

        # State estimate and its covariance.
        self.x = np.zeros(state_dim)
        self.P = np.eye(state_dim) * 0.1

        # Process and measurement noise covariances.
        self.Q = np.eye(state_dim) * 0.01
        self.R = np.eye(measurement_dim) * 0.1

    def predict(self, dt):
        """Propagate the state forward by *dt* seconds (constant velocity)."""
        # Transition: position += velocity * dt; velocity unchanged.
        transition = np.eye(self.state_dim)
        transition[:3, 3:6] = dt * np.eye(3)

        self.x = transition @ self.x
        self.P = transition @ self.P @ transition.T + self.Q

    def update(self, measurement):
        """Correct the state with a 3-D position *measurement*."""
        # Observation model: we measure the position components only.
        obs = np.zeros((self.measurement_dim, self.state_dim))
        obs[:3, :3] = np.eye(3)

        innovation = measurement - obs @ self.x
        innovation_cov = obs @ self.P @ obs.T + self.R
        gain = self.P @ obs.T @ np.linalg.inv(innovation_cov)

        self.x = self.x + gain @ innovation
        self.P = (np.eye(self.state_dim) - gain @ obs) @ self.P

    def get_pose(self):
        """Return the current position, velocity and covariance estimates."""
        return {
            'position': self.x[:3],
            'velocity': self.x[3:6],
            'covariance': self.P
        }
|
||
|
|
```
|
||
|
|
|
||
|
|
## 🎨 Rendering Optimization
|
||
|
|
|
||
|
|
### OpenGL Optimization
|
||
|
|
```python
|
||
|
|
import OpenGL.GL as gl
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
class OptimizedRenderer:
    """Minimal OpenGL renderer tuned for low per-frame overhead.

    Requires an active OpenGL context.  Callers must assign
    ``shader_program``, ``view_location`` and ``projection_location``
    (e.g. via :meth:`create_shader_program` and ``glGetUniformLocation``)
    before calling :meth:`render_frame`.
    """

    def __init__(self):
        self.shader_program = None
        self.vao = None
        self.vbo = None
        self.ebo = None
        # Number of indices uploaded by setup_buffers; the original read
        # self.index_count in render_frame without ever initializing it.
        self.index_count = 0

        self.setup_gl()

    def setup_gl(self):
        """Enable depth testing, back-face culling and alpha blending."""
        gl.glEnable(gl.GL_DEPTH_TEST)
        gl.glEnable(gl.GL_CULL_FACE)
        gl.glEnable(gl.GL_BLEND)
        gl.glBlendFunc(gl.GL_SRC_ALPHA, gl.GL_ONE_MINUS_SRC_ALPHA)

        # Dark grey clear colour.
        gl.glClearColor(0.1, 0.1, 0.1, 1.0)

    def create_shader_program(self, vertex_source, fragment_source):
        """Compile and link a shader program; raises RuntimeError on failure.

        The original ignored compile/link status, which leads to silent
        rendering failures; we surface the driver's info log instead.
        """
        vertex_shader = gl.glCreateShader(gl.GL_VERTEX_SHADER)
        gl.glShaderSource(vertex_shader, vertex_source)
        gl.glCompileShader(vertex_shader)
        if not gl.glGetShaderiv(vertex_shader, gl.GL_COMPILE_STATUS):
            raise RuntimeError(gl.glGetShaderInfoLog(vertex_shader))

        fragment_shader = gl.glCreateShader(gl.GL_FRAGMENT_SHADER)
        gl.glShaderSource(fragment_shader, fragment_source)
        gl.glCompileShader(fragment_shader)
        if not gl.glGetShaderiv(fragment_shader, gl.GL_COMPILE_STATUS):
            raise RuntimeError(gl.glGetShaderInfoLog(fragment_shader))

        program = gl.glCreateProgram()
        gl.glAttachShader(program, vertex_shader)
        gl.glAttachShader(program, fragment_shader)
        gl.glLinkProgram(program)
        if not gl.glGetProgramiv(program, gl.GL_LINK_STATUS):
            raise RuntimeError(gl.glGetProgramInfoLog(program))

        # Shader objects are no longer needed once the program is linked.
        gl.glDeleteShader(vertex_shader)
        gl.glDeleteShader(fragment_shader)

        return program

    def setup_buffers(self, vertices, indices):
        """Upload interleaved vertex data and an index buffer.

        Assumes a 24-byte stride: a vec3 position at offset 0 and a second
        vec3 attribute (e.g. normal or colour) at offset 12 - TODO confirm
        against the vertex layout used by the shaders.
        """
        # Needed for the attribute-offset pointer below; the original used
        # ctypes.c_void_p without importing ctypes anywhere (NameError).
        import ctypes

        # Create VAO
        self.vao = gl.glGenVertexArrays(1)
        gl.glBindVertexArray(self.vao)

        # Create VBO
        self.vbo = gl.glGenBuffers(1)
        gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self.vbo)
        gl.glBufferData(gl.GL_ARRAY_BUFFER, vertices.nbytes, vertices, gl.GL_STATIC_DRAW)

        # Create EBO
        self.ebo = gl.glGenBuffers(1)
        gl.glBindBuffer(gl.GL_ELEMENT_ARRAY_BUFFER, self.ebo)
        gl.glBufferData(gl.GL_ELEMENT_ARRAY_BUFFER, indices.nbytes, indices, gl.GL_STATIC_DRAW)

        # Record how many indices to draw; render_frame reads this.
        self.index_count = len(indices)

        # Set vertex attributes (position, then the second vec3).
        gl.glVertexAttribPointer(0, 3, gl.GL_FLOAT, gl.GL_FALSE, 24, None)
        gl.glEnableVertexAttribArray(0)

        gl.glVertexAttribPointer(1, 3, gl.GL_FLOAT, gl.GL_FALSE, 24, ctypes.c_void_p(12))
        gl.glEnableVertexAttribArray(1)

    def render_frame(self, pose_data):
        """Render one frame for the given pose."""
        # Clear buffers
        gl.glClear(gl.GL_COLOR_BUFFER_BIT | gl.GL_DEPTH_BUFFER_BIT)

        # Use shader program
        gl.glUseProgram(self.shader_program)

        # Update uniform matrices
        self.update_matrices(pose_data)

        # Bind VAO and draw
        gl.glBindVertexArray(self.vao)
        gl.glDrawElements(gl.GL_TRIANGLES, self.index_count, gl.GL_UNSIGNED_INT, None)

    def update_matrices(self, pose_data):
        """Recompute and upload the view/projection uniform matrices.

        NOTE(review): relies on ``self.view_location`` /
        ``self.projection_location`` and the ``calculate_*`` helpers being
        provided elsewhere - confirm against the full implementation.
        """
        view_matrix = self.calculate_view_matrix(pose_data)
        projection_matrix = self.calculate_projection_matrix()

        # Upload to GPU
        gl.glUniformMatrix4fv(self.view_location, 1, gl.GL_FALSE, view_matrix)
        gl.glUniformMatrix4fv(self.projection_location, 1, gl.GL_FALSE, projection_matrix)
|
||
|
|
```
|
||
|
|
|
||
|
|
### NeRF Rendering Optimization
|
||
|
|
```python
|
||
|
|
import torch
|
||
|
|
import torch.nn as nn
|
||
|
|
|
||
|
|
class OptimizedNeRFRenderer:
    """Chunked, inference-only NeRF renderer.

    Rays are rendered in fixed-size chunks so peak GPU memory stays
    bounded regardless of how many rays are requested.
    """

    def __init__(self, model_path, device='cuda'):
        self.device = device
        self.model = self.load_model(model_path)
        self.model.to(device)
        self.model.eval()  # inference only: freezes dropout / batch-norm

        # Optimization settings
        self.chunk_size = 4096  # rays per forward pass (memory vs. speed trade-off)
        self.num_samples = 64   # depth samples per ray

    def load_model(self, model_path):
        """Load optimized NeRF model"""
        # Load pre-trained model
        # SECURITY NOTE(review): torch.load unpickles arbitrary objects;
        # only load checkpoints from trusted sources.
        model = torch.load(model_path, map_location=self.device)
        return model

    @torch.no_grad()
    def render_rays(self, rays_o, rays_d, near, far):
        """Optimized ray rendering.

        Renders RGB for rays with origins ``rays_o`` and directions
        ``rays_d``, integrating between depths ``near`` and ``far``.
        Assumes rays_o / rays_d are (N, 3) tensors - TODO confirm.
        """
        # Process rays in chunks so a huge batch cannot exhaust GPU memory.
        outputs = []

        for i in range(0, rays_o.shape[0], self.chunk_size):
            chunk_o = rays_o[i:i+self.chunk_size]
            chunk_d = rays_d[i:i+self.chunk_size]

            # Render chunk
            chunk_output = self._render_chunk(chunk_o, chunk_d, near, far)
            outputs.append(chunk_output)

        # Combine outputs back into one tensor in the original ray order.
        return torch.cat(outputs, dim=0)

    def _render_chunk(self, rays_o, rays_d, near, far):
        """Render a chunk of rays"""
        # Sample points along rays: depths uniform between near and far.
        t_vals = torch.linspace(0., 1., self.num_samples, device=self.device)
        z_vals = near * (1. - t_vals) + far * t_vals

        # Expand dimensions so every ray shares the same depth samples.
        z_vals = z_vals.unsqueeze(0).expand(rays_o.shape[0], -1)

        # Sample points: p = o + t * d for each depth t.
        pts = rays_o.unsqueeze(1) + rays_d.unsqueeze(1) * z_vals.unsqueeze(-1)

        # Query network for colour and density at each sample point.
        rgb, sigma = self.model(pts, rays_d)

        # Volume rendering: integrate colours along each ray.
        rgb_final = self._volume_render(rgb, sigma, z_vals)

        return rgb_final

    def _volume_render(self, rgb, sigma, z_vals):
        """Volume rendering integration.

        Standard NeRF quadrature: alpha-composite per-sample colours using
        densities ``sigma`` and the distances between depth samples.
        """
        # Distances between consecutive samples; the final sample gets a
        # huge distance (1e10) so it absorbs any remaining transmittance.
        dists = z_vals[..., 1:] - z_vals[..., :-1]
        dists = torch.cat([dists, torch.tensor([1e10], device=self.device).expand(dists[..., :1].shape)], -1)

        # Per-sample opacity from density and distance.
        alpha = 1. - torch.exp(-sigma * dists)

        # Weights: alpha times accumulated transmittance.  The prepended
        # ones-column shifts the cumulative product so each sample sees only
        # the transmittance of the samples *before* it; 1e-10 avoids a zero
        # product when alpha reaches 1.
        weights = alpha * torch.cumprod(torch.cat([torch.ones((alpha.shape[0], 1), device=self.device), 1.-alpha + 1e-10], -1), -1)[:, :-1]

        # Integrate: weighted sum of colours along the ray.
        rgb_final = torch.sum(weights.unsqueeze(-1) * rgb, -2)

        return rgb_final
|
||
|
|
```
|
||
|
|
|
||
|
|
## 🔧 Configuration Optimization
|
||
|
|
|
||
|
|
### Performance Configuration
|
||
|
|
```json
|
||
|
|
{
|
||
|
|
"performance": {
|
||
|
|
"target_latency": 20,
|
||
|
|
"target_fps": 30,
|
||
|
|
"target_accuracy": 10,
|
||
|
|
"max_cpu_usage": 80,
|
||
|
|
"max_gpu_usage": 90,
|
||
|
|
"max_memory_usage": 80
|
||
|
|
},
|
||
|
|
"processing": {
|
||
|
|
"vision_slam": {
|
||
|
|
"max_features": 1000,
|
||
|
|
"min_features": 100,
|
||
|
|
"update_rate": 30,
|
||
|
|
"quality_level": 0.01
|
||
|
|
},
|
||
|
|
"rf_slam": {
|
||
|
|
"packet_rate": 100,
|
||
|
|
"aoa_estimation": "music",
|
||
|
|
"filter_window": 10
|
||
|
|
},
|
||
|
|
"sensor_fusion": {
|
||
|
|
"fusion_method": "ekf",
|
||
|
|
"vision_weight": 0.7,
|
||
|
|
"rf_weight": 0.3,
|
||
|
|
"process_noise": 0.01
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"rendering": {
|
||
|
|
"quality": "high",
|
||
|
|
"resolution": [1280, 720],
|
||
|
|
"vsync": true,
|
||
|
|
"antialiasing": true,
|
||
|
|
"shadow_quality": "medium"
|
||
|
|
},
|
||
|
|
"optimization": {
|
||
|
|
"use_gpu": true,
|
||
|
|
"use_multithreading": true,
|
||
|
|
"memory_pooling": true,
|
||
|
|
"chunk_processing": true
|
||
|
|
}
|
||
|
|
}
|
||
|
|
```
|
||
|
|
|
||
|
|
### Dynamic Configuration
|
||
|
|
```python
|
||
|
|
class DynamicConfigManager:
    """Adapts the runtime configuration to observed performance.

    Holds an untouched ``base_config`` plus a mutable ``current_config``
    that is degraded one step at a time while performance targets are
    missed.
    """

    def __init__(self, base_config):
        import copy  # local: only needed for the deep copy below

        self.base_config = base_config
        # Deep copy: the original used dict.copy(), a *shallow* copy, so
        # tuning nested keys (e.g. vision_slam.max_features) silently
        # mutated base_config as well.
        self.current_config = copy.deepcopy(base_config)
        self.performance_monitor = PerformanceMonitor()

    def optimize_config(self):
        """Dynamically optimize configuration based on performance."""
        metrics = self.performance_monitor.get_average_metrics()

        # Each check degrades one aspect of the config a single step;
        # repeated calls converge toward a sustainable load.
        if metrics.get('latency', 0) > 25:
            self._reduce_processing_load()

        if metrics.get('fps', 0) < 25:
            self._reduce_rendering_quality()

        if metrics.get('cpu_usage', 0) > 85:
            self._reduce_thread_count()

        if metrics.get('memory_usage', 0) > 85:
            self._reduce_buffer_sizes()

    def _reduce_processing_load(self):
        """Lower the SLAM feature budget and update rate (floors: 500 / 20)."""
        slam_cfg = self.current_config['processing']['vision_slam']
        slam_cfg['max_features'] = max(500, slam_cfg['max_features'] - 100)
        slam_cfg['update_rate'] = max(20, slam_cfg['update_rate'] - 5)

    def _reduce_rendering_quality(self):
        """Step rendering quality down one level (high -> medium -> low)."""
        quality_levels = ['high', 'medium', 'low']
        current_quality = self.current_config['rendering']['quality']
        current_index = quality_levels.index(current_quality)

        if current_index < len(quality_levels) - 1:
            self.current_config['rendering']['quality'] = quality_levels[current_index + 1]

    def _reduce_thread_count(self):
        """Reduce thread count."""
        # Implementation for reducing thread count
        pass

    def _reduce_buffer_sizes(self):
        """Reduce buffer sizes."""
        # Implementation for reducing buffer sizes
        pass
|
||
|
|
```
|
||
|
|
|
||
|
|
## 📊 Performance Testing
|
||
|
|
|
||
|
|
### Benchmark Suite
|
||
|
|
```python
|
||
|
|
import time
|
||
|
|
import statistics
|
||
|
|
|
||
|
|
class PerformanceBenchmark:
    """Latency / throughput / memory micro-benchmark harness."""

    def __init__(self):
        self.results = {}

    def benchmark_latency(self, func, *args, iterations=100):
        """Time ``func(*args)`` over *iterations* runs; returns ms statistics.

        Returns a dict with mean/median/std/min/max/p95/p99 in
        milliseconds.  ``std`` and the percentiles require at least two
        samples; with a single iteration they fall back to 0.0 and the
        single measurement (the original raised StatisticsError there).
        """
        times = []

        for _ in range(iterations):
            start_time = time.perf_counter()
            func(*args)
            end_time = time.perf_counter()
            times.append((end_time - start_time) * 1000)  # Convert to ms

        multi = len(times) > 1
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'std': statistics.stdev(times) if multi else 0.0,
            'min': min(times),
            'max': max(times),
            'p95': statistics.quantiles(times, n=20)[18] if multi else times[0],   # 95th percentile
            'p99': statistics.quantiles(times, n=100)[98] if multi else times[0],  # 99th percentile
        }

    def benchmark_throughput(self, func, *args, duration=10):
        """Call ``func(*args)`` repeatedly for *duration* seconds; returns ops/sec."""
        start_time = time.perf_counter()
        count = 0

        while time.perf_counter() - start_time < duration:
            func(*args)
            count += 1

        return count / duration  # Operations per second

    def benchmark_memory(self, func, *args):
        """Return the RSS increase (bytes) caused by one call of ``func(*args)``."""
        import psutil
        import gc

        # Force garbage collection so the baseline is not inflated by
        # collectable garbage from earlier work.
        gc.collect()

        process = psutil.Process()
        initial_memory = process.memory_info().rss

        func(*args)

        final_memory = process.memory_info().rss

        return final_memory - initial_memory  # Memory increase in bytes

    def run_full_benchmark(self):
        """Run the complete pipeline benchmark suite.

        NOTE(review): the *_test callables below are not defined in this
        class - they must be supplied by a subclass or assigned on the
        instance before calling this method.
        """
        benchmark_results = {}

        benchmark_results['camera_capture'] = self.benchmark_latency(
            self.camera_capture_test
        )

        benchmark_results['csi_processing'] = self.benchmark_latency(
            self.csi_processing_test
        )

        benchmark_results['slam_processing'] = self.benchmark_latency(
            self.slam_processing_test
        )

        benchmark_results['rendering'] = self.benchmark_latency(
            self.rendering_test
        )

        benchmark_results['end_to_end'] = self.benchmark_latency(
            self.end_to_end_test
        )

        return benchmark_results

    def generate_report(self, results):
        """Summarize benchmark *results*: total latency, bottleneck, grade, hints."""
        report = {
            'summary': {
                'total_latency': sum(r['mean'] for r in results.values()),
                # The component with the highest mean latency.
                'bottleneck': max(results.items(), key=lambda x: x[1]['mean'])[0],
                'performance_grade': self._calculate_grade(results)
            },
            'details': results,
            'recommendations': self._generate_recommendations(results)
        }

        return report

    def _calculate_grade(self, results):
        """Grade A-D from the summed mean latency (A < 20ms ... D >= 40ms)."""
        total_latency = sum(r['mean'] for r in results.values())

        if total_latency < 20:
            return 'A'
        elif total_latency < 30:
            return 'B'
        elif total_latency < 40:
            return 'C'
        else:
            return 'D'

    def _generate_recommendations(self, results):
        """Flag components with high latency or high tail-latency variance."""
        recommendations = []

        for component, metrics in results.items():
            if metrics['mean'] > 10:  # High latency threshold
                recommendations.append(f"Optimize {component} - current latency: {metrics['mean']:.2f}ms")

            if metrics['p99'] > metrics['mean'] * 2:  # High variance
                recommendations.append(f"Reduce variance in {component} - p99: {metrics['p99']:.2f}ms")

        return recommendations
|
||
|
|
```
|
||
|
|
|
||
|
|
## 🚀 Deployment Optimization
|
||
|
|
|
||
|
|
### Production Configuration
|
||
|
|
```yaml
|
||
|
|
# docker-compose.prod.yml
|
||
|
|
version: '3.8'
|
||
|
|
|
||
|
|
services:
|
||
|
|
nowyouseeme:
|
||
|
|
build:
|
||
|
|
context: .
|
||
|
|
dockerfile: Dockerfile
|
||
|
|
target: production
|
||
|
|
container_name: nowyouseeme-prod
|
||
|
|
ports:
|
||
|
|
- "8080:8080"
|
||
|
|
volumes:
|
||
|
|
- ./config:/app/config:ro
|
||
|
|
- ./data:/app/data
|
||
|
|
- ./logs:/app/logs
|
||
|
|
environment:
|
||
|
|
- PYTHONPATH=/app/src
|
||
|
|
- NOWYOUSEE_DEBUG=0
|
||
|
|
- CUDA_VISIBLE_DEVICES=0
|
||
|
|
- OMP_NUM_THREADS=4
|
||
|
|
- MKL_NUM_THREADS=4
|
||
|
|
devices:
|
||
|
|
- /dev/video0:/dev/video0
|
||
|
|
- /dev/bus/usb:/dev/bus/usb
|
||
|
|
network_mode: host
|
||
|
|
restart: unless-stopped
|
||
|
|
deploy:
|
||
|
|
resources:
|
||
|
|
limits:
|
||
|
|
cpus: '4.0'
|
||
|
|
memory: 8G
|
||
|
|
reservations:
|
||
|
|
cpus: '2.0'
|
||
|
|
memory: 4G
|
||
|
|
healthcheck:
|
||
|
|
test: ["CMD", "python3", "-c", "import sys; sys.exit(0)"]
|
||
|
|
interval: 30s
|
||
|
|
timeout: 10s
|
||
|
|
retries: 3
|
||
|
|
start_period: 40s
|
||
|
|
```
|
||
|
|
|
||
|
|
### Monitoring Setup
|
||
|
|
```python
|
||
|
|
# monitoring.py
|
||
|
|
import prometheus_client
|
||
|
|
from prometheus_client import Counter, Histogram, Gauge
|
||
|
|
|
||
|
|
class PerformanceMetrics:
    """Prometheus metric definitions and recording helpers for NowYouSeeMe."""

    def __init__(self):
        # Latency distribution, bucketed around the 20 ms real-time target.
        self.latency_histogram = Histogram(
            'nowyouseeme_latency_seconds',
            'End-to-end latency in seconds',
            buckets=[0.01, 0.02, 0.03, 0.05, 0.1, 0.2, 0.5]
        )

        # Point-in-time gauges for the remaining metrics.
        self.fps_gauge = Gauge(
            'nowyouseeme_fps',
            'Current frame rate'
        )
        self.accuracy_gauge = Gauge(
            'nowyouseeme_accuracy_cm',
            'Current tracking accuracy in cm'
        )
        self.cpu_usage_gauge = Gauge(
            'nowyouseeme_cpu_usage_percent',
            'CPU usage percentage'
        )
        self.gpu_usage_gauge = Gauge(
            'nowyouseeme_gpu_usage_percent',
            'GPU usage percentage'
        )
        self.memory_usage_gauge = Gauge(
            'nowyouseeme_memory_usage_percent',
            'Memory usage percentage'
        )

    def record_latency(self, latency_ms):
        """Record one end-to-end latency sample (input in milliseconds)."""
        seconds = latency_ms / 1000.0
        self.latency_histogram.observe(seconds)

    def record_fps(self, fps):
        """Publish the current frame rate."""
        self.fps_gauge.set(fps)

    def record_accuracy(self, accuracy_cm):
        """Publish the current tracking accuracy in centimetres."""
        self.accuracy_gauge.set(accuracy_cm)

    def record_system_metrics(self, cpu_percent, gpu_percent, memory_percent):
        """Publish CPU, GPU and memory utilization in one call."""
        self.cpu_usage_gauge.set(cpu_percent)
        self.gpu_usage_gauge.set(gpu_percent)
        self.memory_usage_gauge.set(memory_percent)
|
||
|
|
|
||
|
|
# Start metrics server
|
||
|
|
if __name__ == '__main__':
|
||
|
|
prometheus_client.start_http_server(8000)
|
||
|
|
```
|
||
|
|
|
||
|
|
---

For more detailed optimization strategies, see:

- [Architecture Guide](architecture.md) - System design and optimization
- [Troubleshooting Guide](troubleshooting.md) - Performance issue resolution
- [API Reference](API_REFERENCE.md) - Performance-related API calls