I’m facing considerable latency when calling Gemini-Embedding-001 via the Vertex AI API.
The code below takes between 11 and 12 seconds to run.
Warming up the client, changing the batch size, and similar tweaks do not affect these numbers (a sketch of the warm-up experiment is at the end of this post).
Is this expected behavior, or am I missing something?
This latency essentially doubles my pipeline’s total running time.
This is how I initialize my clients:
import logging
from google.oauth2.service_account import Credentials
from Configurations import configurations
from google.genai import Client
from google.cloud import storage


class GoogleClients:
    def __init__(self) -> None:
        try:
            # Service-account credentials with the cloud-platform scope.
            credentials = Credentials.from_service_account_file(filename=configurations.CREDENTIALS_PATH, scopes=['https://www.googleapis.com/auth/cloud-platform'])
            # google-genai client in Vertex AI mode, plus a Cloud Storage client sharing the same credentials.
            self.VertexAIClient = Client(vertexai=True, project=configurations.GOOGLE_PROJECT_ID, location='global', credentials=credentials)
            self.storageClient = storage.Client(project=configurations.GOOGLE_PROJECT_ID, credentials=credentials)
            logging.info('GoogleClients initialization completed successfully.')
        except Exception as error:
            raise RuntimeError(f'Failed to initialize GoogleClients. Aborting application: {error}.') from error


clients = GoogleClients()
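For completeness, Configurations.configurations is just a plain settings object along these lines; the values below are representative placeholders rather than my exact settings:

# Configurations.py (sketch only; values are placeholders, not my real configuration)
class _Configurations:
    CREDENTIALS_PATH = '/path/to/service-account.json'
    GOOGLE_PROJECT_ID = 'my-project-id'
    DENSE_EMBEDDING_MODEL = 'gemini-embedding-001'
    DENSE_EMBEDDING_BATCH_SIZE = 32
    ASYNCIO_SEMAPHORE = 8

configurations = _Configurations()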
And this is how I call the embedding model:
import logging
import asyncio
import time
from GoogleClients import clients
from Configurations import configurations
from typing import Optional
from tqdm.asyncio import tqdm_asyncio


class DenseEmbedder:
    def __init__(self) -> None:
        try:
            self.vertexAIClient = clients.VertexAIClient
            # Bounds the number of concurrent embedding requests.
            self.semaphore = asyncio.Semaphore(configurations.ASYNCIO_SEMAPHORE)
            logging.info('DenseEmbedder initialization completed successfully.')
        except Exception as error:
            raise RuntimeError(f'Failed to initialize DenseEmbedder. Aborting application: {error}.') from error

    async def embedBatch(self, batch: list[str], taskType: str) -> Optional[list[list[float]]]:
        try:
            async with self.semaphore:
                logging.info(f'Sending batch of {len(batch)} item(s) to Vertex AI embedding API.')
                startTime = time.perf_counter()
                # Single asynchronous embedding call; this is the part that takes 11-12 seconds.
                response = await self.vertexAIClient.aio.models.embed_content(model=configurations.DENSE_EMBEDDING_MODEL, contents=batch, config={'task_type': taskType})
                endTime = time.perf_counter()
                duration = endTime - startTime
                logging.info(f'Vertex AI API call for batch of {len(batch)} completed in {duration:.4f} seconds.')
                return [embedding.values for embedding in response.embeddings]
        except Exception as error:
            logging.warning(f'Failed to generate embeddings for a batch: {error}.')
            return None

    async def embedBatches(self, texts: list[str], taskType: str) -> list[list[float]]:
        try:
            allEmbeddings = []
            batchSize = configurations.DENSE_EMBEDDING_BATCH_SIZE
            # Split the input into batches and embed them concurrently, bounded by the semaphore.
            tasks = [self.embedBatch(texts[i:i + batchSize], taskType) for i in range(0, len(texts), batchSize)]
            batchResults = await tqdm_asyncio.gather(*tasks, desc='Generating dense embeddings...')
            for batchEmbeddings in batchResults:
                if batchEmbeddings:
                    allEmbeddings.extend(batchEmbeddings)
            logging.info('Dense embedding generation completed successfully.')
            return allEmbeddings
        except Exception as error:
            raise RuntimeError(f'Failed to generate dense embeddings: {error}') from error


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
denseEmbedder = DenseEmbedder()
dummyText = ['This is a sample text for embedding.']
embeddings = asyncio.run(denseEmbedder.embedBatch(batch=dummyText, taskType='RETRIEVAL_DOCUMENT'))
Below is representative logging output from a single run.
2025-11-01 22:12:08,790 - INFO - DenseEmbedder initialization completed successfully.
2025-11-01 22:12:08,790 - INFO - Sending batch of 1 item(s) to Vertex AI embedding API.
2025-11-01 22:12:20,587 - INFO - Vertex AI API call for batch of 1 completed in 11.7834 seconds.
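For clarity, this is what I mean by warming up above: a minimal synchronous sketch (the credentials path and project ID are placeholders, and the model name is written out literally instead of coming from my configuration) that issues one untimed call before timing a second one. As noted at the top, this kind of warm-up does not change the numbers for me.

import time
from google.oauth2.service_account import Credentials
from google.genai import Client

# Placeholders: substitute your own service-account file and project ID.
credentials = Credentials.from_service_account_file(filename='/path/to/service-account.json', scopes=['https://www.googleapis.com/auth/cloud-platform'])
client = Client(vertexai=True, project='my-project-id', location='global', credentials=credentials)

texts = ['This is a sample text for embedding.']
config = {'task_type': 'RETRIEVAL_DOCUMENT'}

# Untimed warm-up call.
client.models.embed_content(model='gemini-embedding-001', contents=texts, config=config)

# Timed call issued after the warm-up.
startTime = time.perf_counter()
response = client.models.embed_content(model='gemini-embedding-001', contents=texts, config=config)
print(f'embed_content returned {len(response.embeddings)} embedding(s) in {time.perf_counter() - startTime:.4f} seconds.')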