Added PDF support and sliding window chunking
This commit is contained in:
64
server.py
64
server.py
@@ -5,6 +5,8 @@ from qdrant_client import QdrantClient
|
|||||||
from qdrant_client.models import Distance, VectorParams, PointStruct
|
from qdrant_client.models import Distance, VectorParams, PointStruct
|
||||||
import uuid
|
import uuid
|
||||||
import logging
|
import logging
|
||||||
|
import io
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant.openshift-gitops.svc:6333")
|
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant.openshift-gitops.svc:6333")
|
||||||
@@ -17,17 +19,28 @@ qdrant = QdrantClient(url=QDRANT_URL)
|
|||||||
|
|
||||||
def get_embedding(text: str) -> list[float]:
    """Fetch the embedding vector for ``text`` from the TEI service.

    The text is POSTed as ``{"inputs": text}`` to the ``/embed`` route under
    ``TEI_URL`` with a 10 second timeout, and the first element of the JSON
    response is returned.

    Raises:
        Exception: Whatever the HTTP call, status check, or response parsing
            raised; the failure is logged and then re-raised unchanged.
    """
    endpoint = f"{TEI_URL}/embed"
    request_body = {"inputs": text}
    try:
        reply = httpx.post(endpoint, json=request_body, timeout=10.0)
        reply.raise_for_status()
        vectors = reply.json()
        return vectors[0]
    except Exception as exc:
        # Log for the operator, but let the caller decide how to recover.
        logging.error(f"Embedding failed: {exc}")
        raise
|
||||||
|
|
||||||
|
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks share ``overlap`` characters, so each window starts
    ``chunk_size - overlap`` characters after the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters repeated between neighbouring chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order. Text that already fits in one chunk
        (including the empty string) is returned as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size gives a zero or negative step (the window
        # would never advance); a negative overlap would skip characters.
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    if len(text) <= chunk_size:
        return [text]

    step = chunk_size - overlap
    # A window starting at ``start`` only adds new characters when
    # ``start + overlap < len(text)``; stopping there avoids a trailing chunk
    # that is entirely contained in the previous one.
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text) - overlap, step)
    ]
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def create_notebook(name: str) -> str:
|
def create_notebook(name: str) -> str:
|
||||||
"""Create a new RAG notebook (Qdrant collection)."""
|
"""Create a new RAG notebook (Qdrant collection)."""
|
||||||
@@ -44,25 +57,50 @@ def create_notebook(name: str) -> str:
|
|||||||
return f"Notebook '{clean_name}' created successfully."
|
return f"Notebook '{clean_name}' created successfully."
|
||||||
|
|
||||||
@mcp.tool()
def add_source(
    notebook: str,
    content: str,
    source_name: str,
    format: str = "text",
) -> str:
    """Add content to a notebook: extract text, chunk it, embed it, index it.

    Args:
        notebook: Name of the Qdrant collection to add to; it must exist.
        content: The text itself, or, when ``format`` is ``'pdf_path'``, the
            path of a PDF file local to the container.
        source_name: Label stored in the ``source`` field of every chunk.
        format: ``'text'`` or ``'pdf_path'``. Any value other than
            ``'pdf_path'`` is handled as plain text.

    Returns:
        A human-readable status message: the number of chunks stored, or an
        error / "nothing added" message. Chunks whose embedding fails are
        logged and skipped rather than aborting the whole source.
    """
    if not qdrant.collection_exists(notebook):
        return f"Error: Notebook '{notebook}' does not exist."

    if format == "pdf_path":
        # NOTE(review): ``content`` is a caller-supplied filesystem path and
        # is opened as-is; confirm that tool callers are trusted or restrict
        # the path to an allowed directory.
        try:
            reader = PdfReader(content)
            # Join once instead of repeated ``+=``; ``or ""`` keeps a page
            # that yields no text from breaking the concatenation.
            text_to_process = "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            return f"Error reading PDF: {e}"
    else:
        text_to_process = content

    # Whitespace-only input (e.g. a PDF with no extractable text leaves only
    # the newlines appended above) must not be embedded and indexed.
    if not text_to_process.strip():
        return "No content added (empty or failed)."

    chunks = chunk_text(text_to_process)
    total_chunks = len(chunks)
    points = []

    for i, chunk in enumerate(chunks):
        try:
            vector = get_embedding(chunk)
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={
                    "source": source_name,
                    "text": chunk,
                    "chunk_index": i,
                    "total_chunks": total_chunks,
                },
            ))
        except Exception as e:
            # Best effort: one failed chunk should not discard the rest.
            logging.error(f"Failed to embed chunk {i}: {e}")
            continue

    if points:
        qdrant.upsert(collection_name=notebook, points=points)
        return f"Added {len(points)} chunks from '{source_name}' to '{notebook}'."
    return "No content added (empty or failed)."
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
|
def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
|
||||||
@@ -70,6 +108,7 @@ def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
|
|||||||
if not qdrant.collection_exists(notebook):
|
if not qdrant.collection_exists(notebook):
|
||||||
return f"Error: Notebook '{notebook}' does not exist."
|
return f"Error: Notebook '{notebook}' does not exist."
|
||||||
|
|
||||||
|
try:
|
||||||
vector = get_embedding(query)
|
vector = get_embedding(query)
|
||||||
hits = qdrant.search(
|
hits = qdrant.search(
|
||||||
collection_name=notebook,
|
collection_name=notebook,
|
||||||
@@ -79,9 +118,14 @@ def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
|
|||||||
|
|
||||||
results = []
|
results = []
|
||||||
for hit in hits:
|
for hit in hits:
|
||||||
results.append(f"--- (Score: {hit.score:.2f}) ---\n{hit.payload.get('text', '')}\n")
|
score = hit.score
|
||||||
|
text = hit.payload.get('text', '').replace('\n', ' ')
|
||||||
|
source = hit.payload.get('source', 'unknown')
|
||||||
|
results.append(f"[{score:.2f}] {source}: {text}...")
|
||||||
|
|
||||||
return "\n".join(results)
|
return "\n".join(results)
|
||||||
|
except Exception as e:
|
||||||
|
return f"Query failed: {e}"
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
mcp.run()
|
mcp.run()
|
||||||
|
|||||||
Reference in New Issue
Block a user