Set the `VOYAGE_API_KEY` environment variable so the Voyage AI Python client can authenticate.
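One way to provide the key in a notebook session (a minimal sketch; the interactive `getpass` prompt is just one option) is:

```python
import getpass
import os

# Prompt for the key so it never appears in the notebook source
os.environ["VOYAGE_API_KEY"] = getpass.getpass("Enter your Voyage AI API key: ")
```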
```python
from io import BytesIO

import pymupdf
import requests

# Download the DeepSeek-R1 paper and open it as an in-memory PDF
response = requests.get("https://arxiv.org/pdf/2501.12948")
pdf_stream = BytesIO(response.content)
pdf = pymupdf.open(stream=pdf_stream, filetype="pdf")
```
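As a quick, optional sanity check, confirm the PDF downloaded and parsed correctly before rendering pages:

```python
# The paper should open with a non-zero page count
print(f"Pages in PDF: {pdf.page_count}")
```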
```python
from google.cloud import storage

GCS_PROJECT = "mongodb"
GCS_BUCKET = "tutorials"

gcs_client = storage.Client(project=GCS_PROJECT)
gcs_bucket = gcs_client.bucket(GCS_BUCKET)


def upload_image_to_gcs(key: str, data: bytes) -> None:
    """Upload a PNG image to the GCS bucket under the given key."""
    blob = gcs_bucket.blob(key)
    blob.upload_from_string(data, content_type="image/png")
```
```python
from tqdm import tqdm

docs = []
# Render at 3x zoom so text and figures stay legible in the page images
zoom = 3.0
mat = pymupdf.Matrix(zoom, zoom)

for n in tqdm(range(pdf.page_count)):
    pix = pdf[n].get_pixmap(matrix=mat)
    img_bytes = pix.tobytes("png")
    gcs_key = f"multimodal-rag/{n+1}.png"
    upload_image_to_gcs(gcs_key, img_bytes)
    docs.append(
        {
            "gcs_key": gcs_key,
            "width": pix.width,
            "height": pix.height,
            "image": img_bytes,
        }
    )
```
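If you want to verify the uploads, you can list the blobs under the prefix (a small sketch reusing the `gcs_client` from above):

```python
# List the page images uploaded under the multimodal-rag/ prefix
for blob in gcs_client.list_blobs(GCS_BUCKET, prefix="multimodal-rag/"):
    print(blob.name, blob.size)
```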
Next, we embed each page image with two models and compare their retrieval and downstream generation quality:

- `voyage-multimodal-3`
- `clip-ViT-B-32`
```python
from io import BytesIO

from PIL import Image
from sentence_transformers import SentenceTransformer
from voyageai import Client as VoyageClient

voyageai_client = VoyageClient()
clip_model = SentenceTransformer("clip-ViT-B-32")


def get_voyage_embedding(data, input_type):
    """Embed an image or text query with voyage-multimodal-3."""
    embedding = voyageai_client.multimodal_embed(
        inputs=[[data]],
        model="voyage-multimodal-3",
        input_type=input_type,
    ).embeddings[0]
    return embedding


def get_clip_embedding(data):
    """Embed an image or text query with clip-ViT-B-32."""
    embedding = clip_model.encode(data).tolist()
    return embedding


embedded_docs = []
for doc in tqdm(docs):
    img = Image.open(BytesIO(doc["image"]))
    doc["voyage_embedding"] = get_voyage_embedding(img, "document")
    doc["clip_embedding"] = get_clip_embedding(img)
    # Drop the raw bytes; the image itself lives in GCS under gcs_key
    del doc["image"]
    embedded_docs.append(doc)
```
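Before inserting into MongoDB, it's worth confirming that the embedding dimensions match what the vector index defined below expects; voyage-multimodal-3 produces 1024-dimensional vectors and clip-ViT-B-32 produces 512-dimensional ones:

```python
# Sanity check: dimensions must match the index definition (1024 and 512)
sample = embedded_docs[0]
print(len(sample["voyage_embedding"]))  # expected: 1024
print(len(sample["clip_embedding"]))  # expected: 512
```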
```python
from pymongo import MongoClient

mongodb_client = MongoClient(MONGODB_URI)
collection = mongodb_client["mongodb"]["multimodal_rag"]

# Start clean, then insert one document per PDF page
collection.delete_many({})
collection.insert_many(embedded_docs)
```
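A quick count confirms that one document per page made it into the collection:

```python
# Should equal pdf.page_count
print(collection.count_documents({}))
```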
```python
VS_INDEX_NAME = "vector_index"

model = {
    "name": VS_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                "path": "voyage_embedding",
                "numDimensions": 1024,
                "similarity": "cosine",
            },
            {
                "type": "vector",
                "path": "clip_embedding",
                "numDimensions": 512,
                "similarity": "cosine",
            },
        ],
    },
}

collection.create_search_index(model=model)
```
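Index builds in Atlas are asynchronous, so it can help to wait until the index is queryable before running searches. A minimal polling sketch (assumes PyMongo 4.5+ for `list_search_indexes`):

```python
import time

# Poll until Atlas reports the index as ready to serve queries
while True:
    indexes = list(collection.list_search_indexes(VS_INDEX_NAME))
    if indexes and indexes[0].get("queryable"):
        break
    time.sleep(5)
```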
```python
def get_image_from_gcs(key: str) -> bytes:
    """Download image bytes from the GCS bucket."""
    blob = gcs_bucket.blob(key)
    return blob.download_as_bytes()


def vector_search(user_query: str, model: str, display_images=True):
    """Return the GCS keys of the top 5 pages for a query, using the given model."""
    if model == "voyage":
        query_embedding = get_voyage_embedding(user_query, "query")
    else:
        query_embedding = get_clip_embedding(user_query)

    pipeline = [
        {
            "$vectorSearch": {
                "index": VS_INDEX_NAME,
                "queryVector": query_embedding,
                "path": f"{model}_embedding",
                "numCandidates": 150,
                "limit": 5,
            }
        },
        {
            "$project": {
                "_id": 0,
                "gcs_key": 1,
                "width": 1,
                "height": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    results = collection.aggregate(pipeline)
    gcs_keys = []
    for result in results:
        if display_images:
            # display() renders the image inline in notebook environments
            img = Image.open(BytesIO(get_image_from_gcs(result["gcs_key"])))
            print(f"Score: {result['score']}\n")
            display(img)
        gcs_keys.append(result["gcs_key"])
    return gcs_keys
```
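As an example (the query below is hypothetical, chosen to suit the DeepSeek-R1 paper), you can retrieve pages with each model and compare what comes back:

```python
query = "What is the benchmark performance of DeepSeek-R1 on AIME 2024?"

# Retrieve the top 5 pages with each embedding model
voyage_keys = vector_search(query, "voyage", display_images=False)
clip_keys = vector_search(query, "clip", display_images=False)
print(voyage_keys)
print(clip_keys)
```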
```python
from google import genai
from google.genai import types

gemini_client = genai.Client(api_key=GEMINI_API_KEY)
LLM = "gemini-2.0-flash"


def generate_answer(user_query: str, model: str) -> str:
    """Generate an answer grounded in the pages retrieved by vector_search."""
    gcs_keys = vector_search(user_query, model, display_images=False)
    images = [Image.open(BytesIO(get_image_from_gcs(key))) for key in gcs_keys]
    prompt = f"Answer the question based only on the provided context. If the context is empty, say I DON'T KNOW\n\nQuestion: {user_query}\n\nContext:\n"
    messages = [prompt] + images

    response = gemini_client.models.generate_content(
        model=LLM,
        contents=messages,
        config=types.GenerateContentConfig(temperature=0.0),
    )
    return response.text
```
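To compare end-to-end answers, run the same (hypothetical) question through both retrieval paths:

```python
question = "What is the benchmark performance of DeepSeek-R1 on AIME 2024?"

# Answers grounded in each model's retrieved pages
print(generate_answer(question, "voyage"))
print(generate_answer(question, "clip"))
```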
| Model | MRR (Mean Reciprocal Rank) | Recall@5 | Overall Generation Score (1-5) |
| --- | --- | --- | --- |
| voyage-multimodal-3 | Higher than CLIP | Higher | 4.2 |
| clip-ViT-B-32 | Lower | Lower | 3.5 |