"""GPU stress test: run a matmul workload on GPU:0 while polling nvidia-smi."""
import tensorflow as tf
import time
import subprocess
import threading
import os


def setup_gpu():
    """Verify at least one GPU is visible and enable memory growth on each.

    Raises:
        RuntimeError: if TensorFlow detects no GPU.
    """
    gpus = tf.config.list_physical_devices("GPU")
    if not gpus:
        raise RuntimeError("No GPU detected")
    for gpu in gpus:
        # Allocate GPU memory on demand instead of reserving it all up front.
        tf.config.experimental.set_memory_growth(gpu, True)
    print(f"GPUs detected: {len(gpus)}")


def monitor_gpu(interval=5, duration=30):
    """Poll nvidia-smi every `interval` seconds for `duration` seconds.

    Prints one compact status line per GPU per poll. Errors from the
    subprocess or from parsing its output are reported but do not stop
    the monitor loop.
    """
    end_time = time.time() + duration
    while time.time() < end_time:
        try:
            result = subprocess.run(
                ['nvidia-smi',
                 '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu',
                 '--format=csv,noheader,nounits'],
                capture_output=True, text=True, check=True)
            # nvidia-smi emits one CSV line *per GPU*; iterate them all.
            # (The original unpacked a single line and raised ValueError on
            # multi-GPU hosts.)
            for line in result.stdout.strip().splitlines():
                gpu_util, mem_used, mem_total, temp = line.split(', ')
                print(f"GPU: {gpu_util}% | Mem: {mem_used}/{mem_total}MB | Temp: {temp}C")
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            # Narrowed from a bare `except Exception`: covers a missing
            # nvidia-smi binary, a non-zero exit (check=True), and an
            # unexpected output format — report and keep polling.
            print(f"Monitor error: {e}")
        time.sleep(interval)


def run_gpu_stress(size=4096, duration=30):
    """Hammer GPU:0 with `size` x `size` float32 matmuls for `duration` seconds."""
    with tf.device("/GPU:0"):
        a = tf.random.normal([size, size], dtype=tf.float32)
        b = tf.random.normal([size, size], dtype=tf.float32)

        @tf.function
        def matmul_step():
            return tf.matmul(a, b)

        _ = matmul_step()  # warm-up: trigger tracing/compilation before timing
        print(f"Running GPU stress for {duration}s at full capacity")
        start = time.time()
        iters = 0
        result = None
        while time.time() - start < duration:
            result = matmul_step()
            iters += 1
        if result is not None:
            # TF dispatches ops asynchronously; materializing the last result
            # forces a device sync so the reported time reflects completed
            # work, not just op dispatch.
            _ = result.numpy()
        print(f"Completed {iters} iterations in {time.time() - start:.2f}s")


if __name__ == "__main__":
    try:
        setup_gpu()
        # Daemon thread: do not block interpreter exit if the stress run
        # raises before join() is reached.
        monitor_thread = threading.Thread(target=monitor_gpu, args=(5, 30), daemon=True)
        monitor_thread.start()
        run_gpu_stress(size=4096, duration=30)
        monitor_thread.join()
        print("Test completed successfully")
    except Exception as e:
        # Top-level boundary: surface any failure as a message, not a traceback.
        print(f"Error: {e}")
# Documentation feedback (stray page text from the source copy; kept as a comment so the file stays valid Python)