Added PDF support and sliding window chunking

This commit is contained in:
Clawdbot
2026-02-06 15:11:46 +11:00
parent 8cb860797e
commit 849fbaa936

View File

@@ -5,6 +5,8 @@ from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid import uuid
import logging import logging
import io
from pypdf import PdfReader
# Configuration # Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant.openshift-gitops.svc:6333") QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant.openshift-gitops.svc:6333")
@@ -17,17 +19,28 @@ qdrant = QdrantClient(url=QDRANT_URL)
def get_embedding(text: str) -> list[float]:
    """Fetch the embedding vector for ``text`` from the TEI service.

    Posts ``{"inputs": text}`` to ``{TEI_URL}/embed`` with a 10 second
    timeout and returns the first element of the decoded JSON response.

    Raises:
        Exception: Whatever the HTTP call, status check, or response
            decoding/indexing raised; the error is logged and then
            re-raised unchanged.
    """
    endpoint = f"{TEI_URL}/embed"
    request_body = {"inputs": text}
    try:
        reply = httpx.post(endpoint, json=request_body, timeout=10.0)
        reply.raise_for_status()
        vectors = reply.json()
        # The response is indexed positionally; only the first entry is used.
        return vectors[0]
    except Exception as err:
        logging.error(f"Embedding failed: {err}")
        raise
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split ``text`` into overlapping, fixed-size character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    Windowing stops as soon as a window reaches the end of ``text``, so the
    result never ends with a chunk that is fully contained in its predecessor.

    Args:
        text: The string to split. Sizes are measured in characters.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The chunks in order. Text no longer than ``chunk_size`` (including
        the empty string) is returned unchanged as a single-element list.

    Raises:
        ValueError: If ``text`` has to be split and ``overlap`` is negative
            or not smaller than ``chunk_size``. Such values would either
            skip text between windows or never advance the window.
    """
    if len(text) <= chunk_size:
        return [text]

    # Covers chunk_size <= 0 as well: no non-negative overlap is below it.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This window already reached the end; a further window would
            # only repeat characters that are present in this chunk.
            break
    return chunks
@mcp.tool() @mcp.tool()
def create_notebook(name: str) -> str: def create_notebook(name: str) -> str:
"""Create a new RAG notebook (Qdrant collection).""" """Create a new RAG notebook (Qdrant collection)."""
@@ -44,25 +57,50 @@ def create_notebook(name: str) -> str:
return f"Notebook '{clean_name}' created successfully." return f"Notebook '{clean_name}' created successfully."
@mcp.tool()
def add_source(notebook: str, content: str, source_name: str, format: str = "text") -> str:
    """Add content to a notebook: extract text, chunk it, embed it, index it.

    Args:
        notebook: Name of an existing notebook (Qdrant collection).
        content: The text to index, or, when ``format`` is ``'pdf_path'``,
            a local path (inside the container) to a PDF file.
        source_name: Label stored with every chunk under the ``source`` key.
        format: ``'text'`` or ``'pdf_path'``. Any other value is handled
            like ``'text'``.

    Returns:
        A status message: the number of chunks indexed (plus a warning when
        some chunks could not be embedded), or an error/"no content" message.
    """
    if not qdrant.collection_exists(notebook):
        return f"Error: Notebook '{notebook}' does not exist."

    if format == "pdf_path":
        # NOTE(review): `content` is a caller-supplied filesystem path that is
        # opened as-is. If tool callers are not fully trusted, restrict it to
        # an allowed directory before reading.
        try:
            reader = PdfReader(content)
            # One newline after each page, built with join instead of
            # repeated string concatenation. `or ""` guards against a page
            # whose extraction yields no text object.
            text_to_process = "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
        except Exception as e:
            return f"Error reading PDF: {e}"
    else:
        text_to_process = content

    # Drop whitespace-only chunks (empty input, image-only PDFs): they carry
    # nothing to retrieve and would otherwise be embedded and stored.
    chunks = [chunk for chunk in chunk_text(text_to_process) if chunk.strip()]

    points = []
    failed = 0
    for i, chunk in enumerate(chunks):
        try:
            vector = get_embedding(chunk)
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={
                    "source": source_name,
                    "text": chunk,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                },
            ))
        except Exception as e:
            # Best effort: keep indexing the remaining chunks, but remember
            # the failure so it can be reported to the caller.
            logging.error(f"Failed to embed chunk {i}: {e}")
            failed += 1

    if not points:
        return "No content added (empty or failed)."

    qdrant.upsert(collection_name=notebook, points=points)
    message = f"Added {len(points)} chunks from '{source_name}' to '{notebook}'."
    if failed:
        message += f" Warning: {failed} chunk(s) failed to embed and were skipped."
    return message
@mcp.tool() @mcp.tool()
def query_notebook(notebook: str, query: str, limit: int = 5) -> str: def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
@@ -70,18 +108,24 @@ def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
if not qdrant.collection_exists(notebook): if not qdrant.collection_exists(notebook):
return f"Error: Notebook '{notebook}' does not exist." return f"Error: Notebook '{notebook}' does not exist."
vector = get_embedding(query) try:
hits = qdrant.search( vector = get_embedding(query)
collection_name=notebook, hits = qdrant.search(
query_vector=vector, collection_name=notebook,
limit=limit query_vector=vector,
) limit=limit
)
results = [] results = []
for hit in hits: for hit in hits:
results.append(f"--- (Score: {hit.score:.2f}) ---\n{hit.payload.get('text', '')}\n") score = hit.score
text = hit.payload.get('text', '').replace('\n', ' ')
source = hit.payload.get('source', 'unknown')
results.append(f"[{score:.2f}] {source}: {text}...")
return "\n".join(results) return "\n".join(results)
except Exception as e:
return f"Query failed: {e}"
if __name__ == "__main__": if __name__ == "__main__":
mcp.run() mcp.run()