Gemini-Embedding-001 Latency Considerations

I’m facing considerable latency issues when using Gemini-Embedding-001 via the Vertex AI API.
The code below takes between 11 and 12 seconds to run.
Warming up the client, changing the batch size, etc. does not affect these timings.
I wanted to ask whether this is expected behavior or whether I’m missing something.
Such latencies essentially double my pipeline’s running duration.

This is how I initialize and instantiate my client:

import logging
from google.oauth2.service_account import Credentials
from Configurations import configurations
from google.genai import Client
from google.cloud import storage

class GoogleClients:
    """Holder for the Google Cloud clients used by the application.

    Both clients are built in ``__init__`` from a single service-account
    credential and exposed as the attributes ``VertexAIClient`` and
    ``storageClient``.
    """

    def __init__(self) -> None:
        """Create the Vertex AI and Cloud Storage clients.

        Raises:
            RuntimeError: If any step of the setup fails. The underlying
                exception is chained as the cause.
        """
        try:
            # One credential object, scoped to cloud-platform, is shared by
            # both clients below.
            cloudPlatformScopes = ['https://www.googleapis.com/auth/cloud-platform']
            serviceAccountCredentials = Credentials.from_service_account_file(
                filename = configurations.CREDENTIALS_PATH,
                scopes = cloudPlatformScopes,
            )

            self.VertexAIClient = Client(
                vertexai = True,
                project = configurations.GOOGLE_PROJECT_ID,
                location = 'global',
                credentials = serviceAccountCredentials,
            )
            self.storageClient = storage.Client(
                project = configurations.GOOGLE_PROJECT_ID,
                credentials = serviceAccountCredentials,
            )

            logging.info('GoogleClients initialization completed successfully.')

        except Exception as initError:
            # Re-raise with a uniform message while keeping the original
            # exception attached as __cause__.
            raise RuntimeError(f'Failed to initialize GoogleClients. Aborting application: {initError}.') from initError

# Module-level instance, created when this module is imported; other modules
# obtain it with `from GoogleClients import clients`.
clients = GoogleClients()

And this is how I call the embedding model:

import logging
import asyncio
import time
from GoogleClients import clients
from Configurations import configurations
from typing import Optional
from tqdm.asyncio import tqdm_asyncio

class DenseEmbedder:
    """Generate dense embeddings through the shared Vertex AI client.

    Requests go through the client's async API. The number of requests in
    flight at once is capped by a semaphore sized from
    ``configurations.ASYNCIO_SEMAPHORE``.
    """

    def __init__(self) -> None:
        """Bind the shared client and create the concurrency limiter.

        Raises:
            RuntimeError: If initialization fails. The underlying exception
                is chained as the cause.
        """
        try:
            self.vertexAIClient = clients.VertexAIClient
            # NOTE(review): on Python < 3.10 asyncio primitives bind to the
            # event loop that is current at construction time. If this object
            # is built outside the loop that later runs embedBatch (e.g.
            # before asyncio.run), a contended semaphore can fail there -
            # confirm the runtime version.
            self.semaphore = asyncio.Semaphore(configurations.ASYNCIO_SEMAPHORE)
            logging.info('DenseEmbedder initialization completed successfully.')

        except Exception as error:
            raise RuntimeError(f'Failed to initialize DenseEmbedder. Aborting application: {error}.') from error

    async def embedBatch(self, batch: list[str], taskType: str) -> Optional[list[list[float]]]:
        """Embed one batch of texts with a single API request.

        Args:
            batch: Texts sent together as the ``contents`` of one request.
            taskType: Value passed as ``task_type`` in the request config.

        Returns:
            One vector per entry of ``response.embeddings``, or ``None`` if
            the request (or reading its response) raised. Failures are
            logged as warnings and never propagated, so callers must check
            for ``None``.
        """
        try:
            async with self.semaphore:
                logging.info(f'Sending batch of {len(batch)} item(s) to Vertex AI embedding API.')
                # The timer starts after the semaphore is acquired, so the
                # logged duration excludes time spent waiting for a slot.
                startTime = time.perf_counter()
                response = await self.vertexAIClient.aio.models.embed_content(model = configurations.DENSE_EMBEDDING_MODEL, contents = batch, config = {'task_type': taskType})
                endTime = time.perf_counter()
                duration = endTime - startTime
                logging.info(f'Vertex AI API call for batch of {len(batch)} completed in {duration:.4f} seconds.')
                return [embedding.values for embedding in response.embeddings]

        except Exception as error:
            logging.warning(f'Failed to generate embeddings for a batch: {error}.')
            return None

    async def embedBatches(self, texts: list[str], taskType: str) -> list[list[float]]:
        """Embed ``texts`` in batches and concatenate the results.

        ``texts`` is split into slices of
        ``configurations.DENSE_EMBEDDING_BATCH_SIZE`` and each slice is passed
        to ``embedBatch``; the batch coroutines are awaited together.

        Args:
            texts: Texts to embed.
            taskType: Forwarded unchanged to ``embedBatch``.

        Returns:
            The embeddings of every batch that succeeded, concatenated in
            the order the gathered results are returned. Batches that failed
            are skipped, so the result can be shorter than ``texts``; a
            warning with the failure count is logged in that case.

        Raises:
            RuntimeError: If anything in this method raises. The underlying
                exception is chained as the cause.
        """
        try:
            batchSize = configurations.DENSE_EMBEDDING_BATCH_SIZE
            tasks = [self.embedBatch(texts[i:i + batchSize], taskType) for i in range(0, len(texts), batchSize)]
            batchResults = await tqdm_asyncio.gather(*tasks, desc = 'Generating dense embeddings...')

            allEmbeddings = []
            failedBatches = 0
            for batchEmbeddings in batchResults:
                # embedBatch signals failure with None; keep that distinct
                # from a successful (possibly empty) result.
                if batchEmbeddings is None:
                    failedBatches += 1
                    continue
                allEmbeddings.extend(batchEmbeddings)

            if failedBatches:
                # Do not report success when batches were dropped: the output
                # no longer has one embedding per input text.
                logging.warning(f'Dense embedding generation finished with {failedBatches} of {len(batchResults)} batch(es) failed; returning {len(allEmbeddings)} embedding(s) for {len(texts)} text(s).')
            else:
                logging.info('Dense embedding generation completed successfully.')
            return allEmbeddings

        except Exception as error:
            raise RuntimeError(f'Failed to generate dense embeddings: {error}') from error

# Configure the root logger before the embedder is built; force = True
# replaces any handlers that were already attached to it.
logging.basicConfig(
    format = '%(asctime)s - %(levelname)s - %(message)s',
    level = logging.INFO,
    force = True,
)

denseEmbedder = DenseEmbedder()
dummyText = ["This is a sample text for embedding."]

# Create the coroutine for a single one-item batch, then run it to completion
# on a fresh event loop.
singleBatchRequest = denseEmbedder.embedBatch(batch = dummyText, taskType = 'RETRIEVAL_DOCUMENT')
embeddings = asyncio.run(singleBatchRequest)

Below is an indicative log output.

2025-11-01 22:12:08,790 - INFO - DenseEmbedder initialization completed successfully.
2025-11-01 22:12:08,790 - INFO - Sending batch of 1 item(s) to Vertex AI embedding API.
2025-11-01 22:12:20,587 - INFO - Vertex AI API call for batch of 1 completed in 11.7834 seconds.

1 Like