# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Dict

import pandas as pd
import streamlit as st
import torch
from pyinstrument import Profiler

def init_gpu_memory() -> Dict[int, int]:
    """
    When CUDA is initialized, it occupies some memory on the GPU, and this
    overhead can make it difficult to tell how much memory is actually used
    by the model. This function initializes CUDA on every visible device and
    measures that overhead.
    """
    if not torch.cuda.is_available():
        return {}
    # Allocate a tiny tensor on each device to force CUDA context creation,
    # then record how much memory the context alone occupies.
    gpu_memory_overhead = {}
    for i in range(torch.cuda.device_count()):
        torch.ones(1).cuda(i)
        free, total = torch.cuda.mem_get_info(i)
        occupied = total - free
        gpu_memory_overhead[i] = occupied
    return gpu_memory_overhead
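
# A hedged usage sketch (added for illustration; `load_model` is a
# hypothetical placeholder, not part of this module): subtracting the
# overhead measured above from a later reading attributes the remaining
# memory to the model itself.
#
#     overhead = init_gpu_memory()
#     model = load_model().cuda(0)               # hypothetical loader
#     free, total = torch.cuda.mem_get_info(0)
#     model_bytes = (total - free) - overhead[0]
#     print(f"model occupies ~{model_bytes / 1024 ** 3:.2f} GiB on GPU 0")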

class SystemMonitor:
    """
    Monitors system resources such as GPU memory and CPU usage. Uses the
    pyinstrument library to profile the code, measure the execution time of
    its different parts, and report the results through Streamlit widgets.
    """

    def __init__(self, enabled: bool = False):
        self.enabled = enabled
        self.profiler = Profiler()
        self.overhead: Dict[int, int] = {}

    def __enter__(self):
        if not self.enabled:
            return
        # Snapshot the CUDA context overhead before profiling starts so the
        # final report can separate it from the model's own usage.
        self.overhead = init_gpu_memory()
        self.profiler.__enter__()

    def __exit__(self, exc_type, exc_value, traceback):
        if not self.enabled:
            return
        self.profiler.__exit__(exc_type, exc_value, traceback)
        self.report_gpu_usage()
        self.report_profiler()
        with st.expander("Session state"):
            st.write(st.session_state)
        return None

    def report_gpu_usage(self):
        if not torch.cuda.is_available():
            return
        data = []
        for i in range(torch.cuda.device_count()):
            free, total = torch.cuda.mem_get_info(i)
            occupied = total - free
            data.append({
                "overhead": self.overhead[i],
                "occupied": occupied - self.overhead[i],
                "free": free,
            })
        df = pd.DataFrame(data, columns=["overhead", "occupied", "free"])
        with st.sidebar.expander("System"):
            st.write("GPU memory on server")
            df /= 1024 ** 3  # Convert bytes to GiB
            st.bar_chart(df, width=200, height=200, color=["#fefefe", "#84c9ff", "#fe2b2b"])

    def report_profiler(self):
        html_code = self.profiler.output_html()
        with st.expander("Profiler", expanded=False):
            st.components.v1.html(html_code, height=1000, scrolling=True)
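

# A minimal end-to-end sketch (an addition, not part of the original module):
# it assumes the file is launched with `streamlit run`, and `expensive_work`
# is a hypothetical stand-in for the real application logic being monitored.
if __name__ == "__main__":
    def expensive_work():
        # Hypothetical workload: repeated matrix products, on GPU if available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        x = torch.randn(1024, 1024, device=device)
        for _ in range(50):
            x = x @ x.T / 1024.0

    # Entering the monitor snapshots the CUDA overhead and starts the
    # profiler; exiting renders the GPU memory chart, the pyinstrument
    # report, and the session state into the Streamlit page.
    with SystemMonitor(enabled=True):
        expensive_work()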