Added PDF support and sliding window chunking

2026-02-06 15:11:46 +11:00
parent 8cb860797e
commit 849fbaa936
1 changed file with 70 additions and 26 deletions
--- a/server.py
+++ b/server.py
@@ -5,6 +5,8 @@ from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams, PointStruct
 import uuid
 import logging
+import io
+from pypdf import PdfReader

 # Configuration
 QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant.openshift-gitops.svc:6333")
@@ -17,17 +19,28 @@ qdrant = QdrantClient(url=QDRANT_URL)

 def get_embedding(text: str) -> list[float]:
    """Get embedding from TEI."""
-    url = f"{TEI_URL}/embed" # Adjust based on TEI version, often /v1/embeddings or /embed
-    # Trying standard TEI /embed endpoint for raw lists
+    url = f"{TEI_URL}/embed"
    try:
        response = httpx.post(url, json={"inputs": text}, timeout=10.0)
        response.raise_for_status()
        return response.json()[0]
    except Exception as e:
-        # Fallback to OpenAI compatible endpoint if needed
        logging.error(f"Embedding failed: {e}")
        raise

+def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
+    """Sliding window chunking."""
+    if len(text) <= chunk_size:
+        return [text]
+    
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + chunk_size
+        chunks.append(text[start:end])
+        start += (chunk_size - overlap)
+    return chunks
+
@mcp.tool()
 def create_notebook(name: str) -> str:
    """Create a new RAG notebook (Qdrant collection)."""
@@ -44,25 +57,50 @@ def create_notebook(name: str) -> str:
    return f"Notebook '{clean_name}' created successfully."

@mcp.tool()
-def add_source(notebook: str, text: str, source_name: str = "manual") -> str:
-    """Add text content to a notebook. Ingests, chunks, and indexes."""
+def add_source(notebook: str, content: str, source_name: str, format: str = "text") -> str:
+    """
+    Add content to a notebook.
+    format: 'text' or 'pdf_path' (local path inside container)
+    """
    if not qdrant.collection_exists(notebook):
        return f"Error: Notebook '{notebook}' does not exist."

-    # Very basic chunking for now
-    chunks = [text[i:i+500] for i in range(0, len(text), 500)]
+    text_to_process = ""
+    
+    if format == "pdf_path":
+        try:
+            reader = PdfReader(content)
+            for page in reader.pages:
+                text_to_process += page.extract_text() + "\n"
+        except Exception as e:
+            return f"Error reading PDF: {e}"
+    else:
+        text_to_process = content
+
+    chunks = chunk_text(text_to_process)
    points = []

-    for chunk in chunks:
+    for i, chunk in enumerate(chunks):
+        try:
            vector = get_embedding(chunk)
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
-            payload={"source": source_name, "text": chunk}
+                payload={
+                    "source": source_name,
+                    "text": chunk,
+                    "chunk_index": i,
+                    "total_chunks": len(chunks)
+                }
            ))
+        except Exception as e:
+            logging.error(f"Failed to embed chunk {i}: {e}")
+            continue

+    if points:
        qdrant.upsert(collection_name=notebook, points=points)
        return f"Added {len(points)} chunks from '{source_name}' to '{notebook}'."
+    return "No content added (empty or failed)."

@mcp.tool()
 def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
@@ -70,6 +108,7 @@ def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
    if not qdrant.collection_exists(notebook):
        return f"Error: Notebook '{notebook}' does not exist."

+    try:
        vector = get_embedding(query)
        hits = qdrant.search(
            collection_name=notebook,
@@ -79,9 +118,14 @@ def query_notebook(notebook: str, query: str, limit: int = 5) -> str:

        results = []
        for hit in hits:
-        results.append(f"--- (Score: {hit.score:.2f}) ---\n{hit.payload.get('text', '')}\n")
+            score = hit.score
+            text = hit.payload.get('text', '').replace('\n', ' ')
+            source = hit.payload.get('source', 'unknown')
+            results.append(f"[{score:.2f}] {source}: {text}...")

        return "\n".join(results)
+    except Exception as e:
+        return f"Query failed: {e}"

 if __name__ == "__main__":
    mcp.run()