runtimes

BaseRuntime #

Abstract base class for runtime implementations.

This class defines the interface that all runtime implementations must follow. It provides methods for model initialization, inference, and performance benchmarking.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `model_path` | `str` | Path to the model file. |
| `opts` | `Any` | Runtime-specific options. |
| `model_info` | `RemoteModelInfo` | Metadata about the model. |
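
Concrete runtimes subclass `BaseRuntime` and implement `__call__`, `get_info`, and `benchmark`. Below is a minimal, illustrative sketch of a custom subclass; the `DummyRuntime` name and its echo behavior are not part of the library, and the import path simply follows the source location shown below:

```python
import numpy as np
import torch

from focoos.infer.runtimes.base import BaseRuntime


class DummyRuntime(BaseRuntime):
    """Toy runtime that just echoes the input back as a numpy array."""

    def __init__(self, model_path, opts, model_info):
        self.model_path = model_path
        self.opts = opts
        self.model_info = model_info

    def __call__(self, im: torch.Tensor) -> list[np.ndarray]:
        # A real runtime would execute the model here.
        return [im.cpu().numpy()]

    def get_info(self) -> tuple[str, str]:
        return "dummy", "CPU"

    def benchmark(self, iterations, size):
        raise NotImplementedError  # a real runtime returns LatencyMetrics
```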

Source code in focoos/infer/runtimes/base.py
class BaseRuntime:
    """
    Abstract base class for runtime implementations.

    This class defines the interface that all runtime implementations must follow.
    It provides methods for model initialization, inference, and performance benchmarking.

    Attributes:
        model_path (str): Path to the model file.
        opts (Any): Runtime-specific options.
        model_info (RemoteModelInfo): Metadata about the model.
    """

    def __init__(self, model_path: str, opts: Any, model_info: RemoteModelInfo):
        """
        Initialize the runtime with model path, options and metadata.

        Args:
            model_path (str): Path to the model file.
            opts (Any): Runtime-specific configuration options.
            model_info (RemoteModelInfo): Metadata about the model.
        """
        pass

    @abstractmethod
    def __call__(self, im: torch.Tensor) -> list[np.ndarray]:
        """
        Run inference on the input image.

        Args:
            im (torch.Tensor): Input image tensor.

        Returns:
            list[np.ndarray]: Model outputs as a list of numpy arrays.
        """
        pass

    @abstractmethod
    def get_info(self) -> tuple[str, str]:
        """
        Get the engine and device name.
        """
        pass

    @abstractmethod
    def benchmark(self, iterations: int, size: Union[int, Tuple[int, int]]) -> LatencyMetrics:
        """
        Benchmark the model performance.

        Args:
            iterations (int): Number of inference iterations to run.
            size (int or tuple): Input image size for benchmarking.

        Returns:
            LatencyMetrics: Performance metrics including mean, median, and percentile latencies.
        """
        pass

__call__(im) abstractmethod #

Run inference on the input image.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `im` | `torch.Tensor` | Input image tensor. | required |

Returns:

| Type | Description |
| --- | --- |
| `list[np.ndarray]` | Model outputs as a list of numpy arrays. |

Source code in focoos/infer/runtimes/base.py
@abstractmethod
def __call__(self, im: torch.Tensor) -> list[np.ndarray]:
    """
    Run inference on the input image.

    Args:
        im (torch.Tensor): Input image tensor.

    Returns:
        list[np.ndarray]: Model outputs as a list of numpy arrays.
    """
    pass

__init__(model_path, opts, model_info) #

Initialize the runtime with model path, options and metadata.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model_path` | `str` | Path to the model file. | required |
| `opts` | `Any` | Runtime-specific configuration options. | required |
| `model_info` | `RemoteModelInfo` | Metadata about the model. | required |

Source code in focoos/infer/runtimes/base.py
def __init__(self, model_path: str, opts: Any, model_info: RemoteModelInfo):
    """
    Initialize the runtime with model path, options and metadata.

    Args:
        model_path (str): Path to the model file.
        opts (Any): Runtime-specific configuration options.
        model_info (RemoteModelInfo): Metadata about the model.
    """
    pass

benchmark(iterations, size) abstractmethod #

Benchmark the model performance.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `iterations` | `int` | Number of inference iterations to run. | required |
| `size` | `Union[int, Tuple[int, int]]` | Input image size for benchmarking. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `LatencyMetrics` | `LatencyMetrics` | Performance metrics including mean, median, and percentile latencies. |

Source code in focoos/infer/runtimes/base.py
@abstractmethod
def benchmark(self, iterations: int, size: Union[int, Tuple[int, int]]) -> LatencyMetrics:
    """
    Benchmark the model performance.

    Args:
        iterations (int): Number of inference iterations to run.
        size (int or tuple): Input image size for benchmarking.

    Returns:
        LatencyMetrics: Performance metrics including mean, median, and percentile latencies.
    """
    pass

get_info() abstractmethod #

Get the engine and device name.

Source code in focoos/infer/runtimes/base.py
@abstractmethod
def get_info(self) -> tuple[str, str]:
    """
    Get the engine and device name.
    """
    pass

load_runtime(runtime_type, model_path, model_info, warmup_iter=50) #

Creates and returns a runtime instance based on the specified runtime type. Supports both ONNX and TorchScript runtimes with various execution providers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `runtime_type` | `RuntimeType` | The type of runtime to use: `ONNX_CUDA32` (ONNX with CUDA FP32), `ONNX_TRT32` (ONNX with TensorRT FP32), `ONNX_TRT16` (ONNX with TensorRT FP16), `ONNX_CPU` (ONNX on CPU), `ONNX_COREML` (ONNX with CoreML), or `TORCHSCRIPT_32` (TorchScript FP32). | required |
| `model_path` | `str` | Path to the model file (`.onnx` or `.pt`). | required |
| `model_info` | `ModelInfo` | Model metadata containing task type, classes, etc. | required |
| `warmup_iter` | `int` | Number of warmup iterations before inference. | `50` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `BaseRuntime` | `BaseRuntime` | A configured runtime instance (`ONNXRuntime` or `TorchscriptRuntime`). |

Raises:

| Type | Description |
| --- | --- |
| `ImportError` | If required dependencies (torch/onnxruntime) are not installed. |

Source code in focoos/infer/runtimes/load_runtime.py
def load_runtime(
    runtime_type: RuntimeType,
    model_path: str,
    model_info: ModelInfo,
    warmup_iter: int = 50,
) -> BaseRuntime:
    """
    Creates and returns a runtime instance based on the specified runtime type.
    Supports both ONNX and TorchScript runtimes with various execution providers.

    Args:
        runtime_type (RuntimeType): The type of runtime to use. Can be one of:
            - ONNX_CUDA32: ONNX runtime with CUDA FP32
            - ONNX_TRT32: ONNX runtime with TensorRT FP32
            - ONNX_TRT16: ONNX runtime with TensorRT FP16
            - ONNX_CPU: ONNX runtime with CPU
            - ONNX_COREML: ONNX runtime with CoreML
            - TORCHSCRIPT_32: TorchScript runtime with FP32
        model_path (str): Path to the model file (.onnx or .pt)
        model_info (ModelInfo): Model metadata containing task type, classes etc.
        warmup_iter (int, optional): Number of warmup iterations before inference. Defaults to 50.

    Returns:
        BaseRuntime: A configured runtime instance (ONNXRuntime or TorchscriptRuntime)

    Raises:
        ImportError: If required dependencies (torch/onnxruntime) are not installed
    """
    if runtime_type == RuntimeType.TORCHSCRIPT_32:
        if not TORCH_AVAILABLE:
            logger.error(
                "⚠️ Pytorch not found =(  please install focoos with ['torch'] extra. See https://focoosai.github.io/focoos/setup/ for more details"
            )
            raise ImportError("Pytorch not found")
        from focoos.infer.runtimes.torchscript import TorchscriptRuntime

        opts = TorchscriptRuntimeOpts(warmup_iter=warmup_iter)
        return TorchscriptRuntime(model_path=model_path, opts=opts, model_info=model_info)
    else:
        if not ORT_AVAILABLE:
            logger.error(
                "⚠️ onnxruntime not found =(  please install focoos with one of 'onnx', 'onnx-cpu', extra. See https://focoosai.github.io/focoos/setup/ for more details"
            )
            raise ImportError("onnxruntime not found")
        from focoos.infer.runtimes.onnx import ONNXRuntime

        opts = OnnxRuntimeOpts(
            cuda=runtime_type == RuntimeType.ONNX_CUDA32,
            trt=runtime_type in [RuntimeType.ONNX_TRT32, RuntimeType.ONNX_TRT16],
            fp16=runtime_type == RuntimeType.ONNX_TRT16,
            warmup_iter=warmup_iter,
            coreml=runtime_type == RuntimeType.ONNX_COREML,
            verbose=False,
        )
    return ONNXRuntime(model_path=model_path, opts=opts, model_info=model_info)
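
A short usage sketch. The `load_runtime` import follows the source path above; the `RuntimeType` import location and the `model.onnx` path are assumptions, and `model_info` is taken to be a `ModelInfo` you have already loaded for the exported model:

```python
from focoos.infer.runtimes.load_runtime import load_runtime
from focoos.ports import RuntimeType  # import path assumed; adjust to your focoos version

# model_info: a ModelInfo describing the exported model, loaded elsewhere.
runtime = load_runtime(
    runtime_type=RuntimeType.ONNX_CPU,
    model_path="model.onnx",  # placeholder path
    model_info=model_info,
    warmup_iter=10,
)
engine, device = runtime.get_info()
print(engine, device)
```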

ONNXRuntime #

Bases: BaseRuntime

ONNX Runtime wrapper for model inference with different execution providers.

This class implements the BaseRuntime interface for ONNX models, supporting various execution providers like CUDA, TensorRT, OpenVINO, and CoreML. It handles model initialization, provider configuration, warmup, inference, and performance benchmarking.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `name` | `str` | Name of the model derived from the model path. |
| `opts` | `OnnxRuntimeOpts` | Configuration options for the ONNX runtime. |
| `model_info` | `ModelInfo` | Metadata about the model. |
| `ort_sess` | `ort.InferenceSession` | ONNX Runtime inference session. |
| `active_providers` | `list` | List of active execution providers. |
| `dtype` | `np.dtype` | Input data type for the model. |
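
Most code should obtain an `ONNXRuntime` via `load_runtime`; constructing it directly only requires an `OnnxRuntimeOpts` and a `ModelInfo`. A minimal sketch (the `OnnxRuntimeOpts` import path and the model path are assumptions, and `model_info` is assumed to be loaded elsewhere):

```python
from focoos.infer.runtimes.onnx import ONNXRuntime
from focoos.ports import OnnxRuntimeOpts  # import path assumed; adjust to your focoos version

# CPU-only session with no warmup; the flags mirror those set by load_runtime.
opts = OnnxRuntimeOpts(cuda=False, trt=False, fp16=False, coreml=False, warmup_iter=0, verbose=False)
runtime = ONNXRuntime(model_path="model.onnx", opts=opts, model_info=model_info)
```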

Source code in focoos/infer/runtimes/onnx.py
class ONNXRuntime(BaseRuntime):
    """
    ONNX Runtime wrapper for model inference with different execution providers.

    This class implements the BaseRuntime interface for ONNX models, supporting
    various execution providers like CUDA, TensorRT, OpenVINO, and CoreML.
    It handles model initialization, provider configuration, warmup, inference,
    and performance benchmarking.

    Attributes:
        name (str): Name of the model derived from the model path.
        opts (OnnxRuntimeOpts): Configuration options for the ONNX runtime.
        model_info (ModelInfo): Metadata about the model.
        ort_sess (ort.InferenceSession): ONNX Runtime inference session.
        active_providers (list): List of active execution providers.
        dtype (np.dtype): Input data type for the model.
    """

    def __init__(self, model_path: Union[str, Path], opts: OnnxRuntimeOpts, model_info: ModelInfo):
        logger.debug(f"🔧 [onnxruntime device] {ort.get_device()}")

        self.name = Path(model_path).stem
        self.opts = opts
        self.model_info = model_info

        # Setup session options
        options = ort.SessionOptions()
        options.log_severity_level = 0 if opts.verbose else 2
        options.enable_profiling = opts.verbose

        # Setup providers
        self.providers = self._setup_providers(model_dir=Path(model_path).parent)
        self.active_provider = self.providers[0][0]
        logger.info(f" using: {self.active_provider}")
        # Create session
        self.ort_sess = ort.InferenceSession(model_path, options, providers=self.providers)

        if self.opts.trt and self.providers[0][0] == "TensorrtExecutionProvider":
            logger.info("🟢  TensorRT enabled. First execution may take longer as it builds the TRT engine.")
        # Set input type
        self.dtype = np.uint8 if self.ort_sess.get_inputs()[0].type == "tensor(uint8)" else np.float32

        # Warmup
        if self.opts.warmup_iter > 0:
            self._warmup()

        # inputs = self.ort_sess.get_inputs()
        # outputs = self.ort_sess.get_outputs()
        # for input in inputs:
        #     logger.debug(f"🔧 Input: {input.name} {input.type} {input.shape}")
        # for output in outputs:
        #     logger.debug(f"🔧 Output: {output.name} {output.type} {output.shape}")

    def _setup_providers(self, model_dir: Path):
        providers = []
        available = ort.get_available_providers()
        logger.debug(f"Available providers:{available}")
        _dir = Path(model_dir)
        models_root = _dir.parent
        # Check and add providers in order of preference
        provider_configs = [
            (
                "TensorrtExecutionProvider",
                self.opts.trt,
                {
                    "device_id": GPU_ID,
                    "trt_fp16_enable": self.opts.fp16,
                    "trt_force_sequential_engine_build": False,
                    "trt_engine_cache_enable": True,
                    "trt_engine_cache_path": str(_dir / ".trt_cache"),
                    "trt_ep_context_file_path": str(_dir),
                    "trt_timing_cache_enable": True,  # Timing cache can be shared across multiple models if layers are the same
                    "trt_builder_optimization_level": 3,
                    "trt_timing_cache_path": str(models_root / ".trt_timing_cache"),
                },
            ),
            (
                "OpenVINOExecutionProvider",
                self.opts.vino,
                {"device_type": "MYRIAD_FP16", "enable_vpu_fast_compile": True, "num_of_threads": 1},
            ),
            (
                "CUDAExecutionProvider",
                self.opts.cuda,
                {
                    "device_id": GPU_ID,
                    "arena_extend_strategy": "kSameAsRequested",
                    "gpu_mem_limit": 16 * 1024 * 1024 * 1024,
                    "cudnn_conv_algo_search": "EXHAUSTIVE",
                    "do_copy_in_default_stream": True,
                },
            ),
            ("CoreMLExecutionProvider", self.opts.coreml, {}),
        ]

        for provider, enabled, config in provider_configs:
            if enabled and provider in available:
                providers.append((provider, config))
            elif enabled:
                logger.warning(f"{provider} not found.")

        providers.append(("CPUExecutionProvider", {}))
        return providers

    def _warmup(self):
        size = self.model_info.im_size
        logger.info(f"⏱️ Warming up model {self.name} on {self.active_provider}, size: {size}x{size}..")
        np_image = np.random.rand(1, 3, size, size).astype(self.dtype)
        input_name = self.ort_sess.get_inputs()[0].name
        out_name = [output.name for output in self.ort_sess.get_outputs()]

        for _ in range(self.opts.warmup_iter):
            self.ort_sess.run(out_name, {input_name: np_image})

        logger.info("⏱️ Warmup done")

    def __call__(self, im: torch.Tensor) -> list[np.ndarray]:
        """
        Run inference on the input image.

        Args:
            im (torch.Tensor): Input image tensor.

        Returns:
            list[np.ndarray]: Model outputs as a list of numpy arrays.
        """
        input_name = self.ort_sess.get_inputs()[0].name
        out_name = [output.name for output in self.ort_sess.get_outputs()]
        out = self.ort_sess.run(out_name, {input_name: im.cpu().numpy()})
        return out

    def get_info(self) -> tuple[str, str]:
        gpu_info = get_gpu_info()
        device_name = "CPU"
        if gpu_info.devices is not None and len(gpu_info.devices) > 0:
            device_name = gpu_info.devices[0].gpu_name
        else:
            device_name = get_cpu_name()
            logger.warning(f"No GPU found, using CPU {device_name}.")
        return f"onnx.{self.active_provider}", str(device_name)

    def benchmark(self, iterations: int = 50, size: Union[int, Tuple[int, int]] = 640) -> LatencyMetrics:
        """
        Benchmark the model performance.

        Runs multiple inference iterations and measures execution time to calculate
        performance metrics like FPS, mean latency, and other statistics.

        Args:
            iterations (int, optional): Number of inference iterations to run. Defaults to 50.
            size (int or tuple, optional): Input image size for benchmarking. Defaults to 640.

        Returns:
            LatencyMetrics: Performance metrics including FPS, mean, min, max, and std latencies.
        """
        engine, device_name = self.get_info()
        if isinstance(size, int):
            size = (size, size)

        logger.info(f"⏱️ Benchmarking latency on {device_name}, size: {size}x{size}..")

        np_input = (255 * np.random.random((1, 3, size[0], size[1]))).astype(self.dtype)
        input_name = self.ort_sess.get_inputs()[0].name
        out_name = [output.name for output in self.ort_sess.get_outputs()]

        durations = []
        for step in range(iterations + 5):
            start = perf_counter()
            self.ort_sess.run(out_name, {input_name: np_input})
            end = perf_counter()

            if step >= 5:  # Skip first 5 iterations
                durations.append((end - start) * 1000)

        durations = np.array(durations)

        metrics = LatencyMetrics(
            fps=int(1000 / durations.mean()),
            engine=engine,
            mean=round(durations.mean().astype(float), 3),
            max=round(durations.max().astype(float), 3),
            min=round(durations.min().astype(float), 3),
            std=round(durations.std().astype(float), 3),
            im_size=size[0],  # FIXME: this is a hack to get the im_size as int, assuming it's a square
            device=device_name,
        )
        logger.info(f"🔥 FPS: {metrics.fps} Mean latency: {metrics.mean} ms ")
        return metrics

__call__(im) #

Run inference on the input image.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `im` | `torch.Tensor` | Input image tensor. | required |

Returns:

| Type | Description |
| --- | --- |
| `list[np.ndarray]` | Model outputs as a list of numpy arrays. |
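
The runtime expects a batched NCHW `torch.Tensor` and converts it to numpy internally; whether values should be `uint8` or `float32` depends on how the model was exported (see `self.dtype` in the source below). A hedged sketch, assuming `runtime` is an already-initialized `ONNXRuntime`, an HWC `uint8` image already resized to the model's input size, and a `uint8` model input:

```python
import numpy as np
import torch

image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)  # stand-in for a real HWC image
tensor = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0)     # 1x3xHxW, still uint8
outputs = runtime(tensor)  # list[np.ndarray], one array per model output
```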

Source code in focoos/infer/runtimes/onnx.py
def __call__(self, im: torch.Tensor) -> list[np.ndarray]:
    """
    Run inference on the input image.

    Args:
        im (torch.Tensor): Input image tensor.

    Returns:
        list[np.ndarray]: Model outputs as a list of numpy arrays.
    """
    input_name = self.ort_sess.get_inputs()[0].name
    out_name = [output.name for output in self.ort_sess.get_outputs()]
    out = self.ort_sess.run(out_name, {input_name: im.cpu().numpy()})
    return out

benchmark(iterations=50, size=640) #

Benchmark the model performance.

Runs multiple inference iterations and measures execution time to calculate performance metrics like FPS, mean latency, and other statistics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `iterations` | `int` | Number of inference iterations to run. | `50` |
| `size` | `int or tuple` | Input image size for benchmarking. | `640` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `LatencyMetrics` | `LatencyMetrics` | Performance metrics including FPS, mean, min, max, and std latencies. |
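
A short sketch of running a benchmark and reading the returned `LatencyMetrics`; the field names follow the metrics constructed in the source below, and `runtime` is assumed to be an already-initialized `ONNXRuntime`:

```python
metrics = runtime.benchmark(iterations=50, size=640)
print(f"engine={metrics.engine} device={metrics.device}")
print(f"fps={metrics.fps} mean={metrics.mean}ms std={metrics.std}ms")
```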

Source code in focoos/infer/runtimes/onnx.py
def benchmark(self, iterations: int = 50, size: Union[int, Tuple[int, int]] = 640) -> LatencyMetrics:
    """
    Benchmark the model performance.

    Runs multiple inference iterations and measures execution time to calculate
    performance metrics like FPS, mean latency, and other statistics.

    Args:
        iterations (int, optional): Number of inference iterations to run. Defaults to 50.
        size (int or tuple, optional): Input image size for benchmarking. Defaults to 640.

    Returns:
        LatencyMetrics: Performance metrics including FPS, mean, min, max, and std latencies.
    """
    engine, device_name = self.get_info()
    if isinstance(size, int):
        size = (size, size)

    logger.info(f"⏱️ Benchmarking latency on {device_name}, size: {size}x{size}..")

    np_input = (255 * np.random.random((1, 3, size[0], size[1]))).astype(self.dtype)
    input_name = self.ort_sess.get_inputs()[0].name
    out_name = [output.name for output in self.ort_sess.get_outputs()]

    durations = []
    for step in range(iterations + 5):
        start = perf_counter()
        self.ort_sess.run(out_name, {input_name: np_input})
        end = perf_counter()

        if step >= 5:  # Skip first 5 iterations
            durations.append((end - start) * 1000)

    durations = np.array(durations)

    metrics = LatencyMetrics(
        fps=int(1000 / durations.mean()),
        engine=engine,
        mean=round(durations.mean().astype(float), 3),
        max=round(durations.max().astype(float), 3),
        min=round(durations.min().astype(float), 3),
        std=round(durations.std().astype(float), 3),
        im_size=size[0],  # FIXME: this is a hack to get the im_size as int, assuming it's a square
        device=device_name,
    )
    logger.info(f"🔥 FPS: {metrics.fps} Mean latency: {metrics.mean} ms ")
    return metrics

TorchscriptRuntime #

Bases: BaseRuntime

TorchScript Runtime wrapper for model inference.

This class implements the BaseRuntime interface for TorchScript models, supporting both CPU and CUDA devices. It handles model initialization, device placement, warmup, inference, and performance benchmarking.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `device` | `torch.device` | Device to run inference on (CPU or CUDA). |
| `opts` | `TorchscriptRuntimeOpts` | Configuration options for the TorchScript runtime. |
| `model` | `torch.jit.ScriptModule` | Loaded TorchScript model. |
| `model_info` | `ModelInfo` | Metadata about the model. |
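
As with the ONNX runtime, this is normally created through `load_runtime` with `RuntimeType.TORCHSCRIPT_32`. A direct-construction sketch (the `TorchscriptRuntimeOpts` import path and the model path are assumptions; `model_info` is assumed to be loaded elsewhere):

```python
from focoos.infer.runtimes.torchscript import TorchscriptRuntime
from focoos.ports import TorchscriptRuntimeOpts  # import path assumed; adjust to your focoos version

opts = TorchscriptRuntimeOpts(warmup_iter=10)
runtime = TorchscriptRuntime(model_path="model.pt", opts=opts, model_info=model_info)
```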

Source code in focoos/infer/runtimes/torchscript.py
class TorchscriptRuntime(BaseRuntime):
    """
    TorchScript Runtime wrapper for model inference.

    This class implements the BaseRuntime interface for TorchScript models,
    supporting both CPU and CUDA devices. It handles model initialization,
    device placement, warmup, inference, and performance benchmarking.

    Attributes:
        device (torch.device): Device to run inference on (CPU or CUDA).
        opts (TorchscriptRuntimeOpts): Configuration options for the TorchScript runtime.
        model (torch.jit.ScriptModule): Loaded TorchScript model.
        model_info (ModelInfo): Metadata about the model.
    """

    def __init__(
        self,
        model_path: str,
        opts: TorchscriptRuntimeOpts,
        model_info: ModelInfo,
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"🔧 Device: {self.device}")
        self.opts = opts
        self.model_info = model_info

        map_location = None if torch.cuda.is_available() else "cpu"

        self.model = torch.jit.load(model_path, map_location=map_location)
        self.model = self.model.to(self.device)

        if self.opts.warmup_iter > 0:
            size = (
                self.model_info.im_size if self.model_info.task == Task.DETECTION and self.model_info.im_size else 640
            )
            logger.info(f"⏱️ Warming up model {self.model_info.name} on {self.device}, size: {size}x{size}..")
            with torch.no_grad():
                np_image = torch.rand(1, 3, size, size, device=self.device)
                for _ in range(self.opts.warmup_iter):
                    self.model(np_image)
            logger.info("⏱️ WARMUP DONE")

    def __call__(self, im: torch.Tensor) -> list[np.ndarray]:
        """
        Run inference on the input image.

        Args:
            im (torch.Tensor): Input image tensor.

        Returns:
            list[np.ndarray]: Model outputs as a list of numpy arrays.
        """
        with torch.no_grad():
            res = self.model(im)
            return res

    def get_info(self) -> tuple[str, str]:
        gpu_info = get_gpu_info()
        device_name = "CPU"
        if gpu_info.devices is not None and len(gpu_info.devices) > 0:
            device_name = gpu_info.devices[0].gpu_name
        else:
            device_name = get_cpu_name()
            logger.warning(f"No GPU found, using CPU {device_name}.")

        return "torchscript", str(device_name)

    def benchmark(self, iterations: int = 20, size: Union[int, Tuple[int, int]] = 640) -> LatencyMetrics:
        """
        Benchmark the model performance.

        Runs multiple inference iterations and measures execution time to calculate
        performance metrics like FPS, mean latency, and other statistics.

        Args:
            iterations (int, optional): Number of inference iterations to run. Defaults to 20.
            size (int or tuple, optional): Input image size for benchmarking. Defaults to 640.

        Returns:
            LatencyMetrics: Performance metrics including FPS, mean, min, max, and std latencies.
        """
        engine, device_name = self.get_info()
        logger.info(f"⏱️ Benchmarking latency on {device_name}, size: {size}x{size}..")

        if isinstance(size, int):
            size = (size, size)

        torch_input = torch.rand(1, 3, size[0], size[1], device=self.device)
        durations = []

        with torch.no_grad():
            for step in range(iterations + 5):
                start = perf_counter()
                self.model(torch_input)
                end = perf_counter()

                if step >= 5:  # Skip first 5 iterations
                    durations.append((end - start) * 1000)

        durations = np.array(durations)

        metrics = LatencyMetrics(
            fps=int(1000 / durations.mean().astype(float)),
            engine=engine,
            mean=round(durations.mean().astype(float), 3),
            max=round(durations.max().astype(float), 3),
            min=round(durations.min().astype(float), 3),
            std=round(durations.std().astype(float), 3),
            im_size=size,
            device=device_name,
        )
        logger.info(f"🔥 FPS: {metrics.fps} Mean latency: {metrics.mean} ms ")
        return metrics

__call__(im) #

Run inference on the input image.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `im` | `torch.Tensor` | Input image tensor. | required |

Returns:

| Type | Description |
| --- | --- |
| `list[np.ndarray]` | Model outputs as a list of numpy arrays. |

Source code in focoos/infer/runtimes/torchscript.py
def __call__(self, im: torch.Tensor) -> list[np.ndarray]:
    """
    Run inference on the input image.

    Args:
        im (torch.Tensor): Input image tensor.

    Returns:
        list[np.ndarray]: Model outputs as a list of numpy arrays.
    """
    with torch.no_grad():
        res = self.model(im)
        return res

benchmark(iterations=20, size=640) #

Benchmark the model performance.

Runs multiple inference iterations and measures execution time to calculate performance metrics like FPS, mean latency, and other statistics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `iterations` | `int` | Number of inference iterations to run. | `20` |
| `size` | `int or tuple` | Input image size for benchmarking. | `640` |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `LatencyMetrics` | `LatencyMetrics` | Performance metrics including FPS, mean, min, max, and std latencies. |

Source code in focoos/infer/runtimes/torchscript.py
def benchmark(self, iterations: int = 20, size: Union[int, Tuple[int, int]] = 640) -> LatencyMetrics:
    """
    Benchmark the model performance.

    Runs multiple inference iterations and measures execution time to calculate
    performance metrics like FPS, mean latency, and other statistics.

    Args:
        iterations (int, optional): Number of inference iterations to run. Defaults to 20.
        size (int or tuple, optional): Input image size for benchmarking. Defaults to 640.

    Returns:
        LatencyMetrics: Performance metrics including FPS, mean, min, max, and std latencies.
    """
    engine, device_name = self.get_info()
    logger.info(f"⏱️ Benchmarking latency on {device_name}, size: {size}x{size}..")

    if isinstance(size, int):
        size = (size, size)

    torch_input = torch.rand(1, 3, size[0], size[1], device=self.device)
    durations = []

    with torch.no_grad():
        for step in range(iterations + 5):
            start = perf_counter()
            self.model(torch_input)
            end = perf_counter()

            if step >= 5:  # Skip first 5 iterations
                durations.append((end - start) * 1000)

    durations = np.array(durations)

    metrics = LatencyMetrics(
        fps=int(1000 / durations.mean().astype(float)),
        engine=engine,
        mean=round(durations.mean().astype(float), 3),
        max=round(durations.max().astype(float), 3),
        min=round(durations.min().astype(float), 3),
        std=round(durations.std().astype(float), 3),
        im_size=size,
        device=device_name,
    )
    logger.info(f"🔥 FPS: {metrics.fps} Mean latency: {metrics.mean} ms ")
    return metrics