From 8cb860797e101ab6ef8ed72c307cc3b5e08787a2 Mon Sep 17 00:00:00 2001
From: Clawdbot
Date: Fri, 6 Feb 2026 15:07:07 +1100
Subject: [PATCH] Initial scaffold for knowledge-mcp

---
 README.md        | 40 ++++++++++++++++++++++
 requirements.txt |  6 ++++
 server.py        | 87 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)
 create mode 100644 README.md
 create mode 100644 requirements.txt
 create mode 100644 server.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cbca63a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,40 @@
+# knowledge-mcp
+
+A Model Context Protocol (MCP) server that provides scoped RAG workspaces ("Notebooks") backed by **Qdrant** and **TEI**.
+
+## Overview
+
+This server enables an agent to:
+1. Create named "Notebooks" (Qdrant Collections).
+2. Ingest documents (PDF, Markdown, Text) into specific notebooks.
+3. Query specific notebooks using vector search (RAG).
+4. Synthesize findings across a notebook.
+
+Designed to replicate the **NotebookLM** experience: clean, focused, bounded context.
+
+## Stack
+* **Language:** Python 3.11+
+* **Framework:** `mcp` SDK
+* **Vector DB:** Qdrant
+* **Embeddings:** Text Embeddings Inference (TEI) - `BAAI/bge-base-en-v1.5`
+
+## Tools
+
+### `notebook.create`
+Creates a new isolated workspace (Qdrant Collection).
+- `name`: string (e.g., "project-alpha")
+
+### `notebook.add_source`
+Ingests a document into the notebook.
+- `notebook`: string
+- `url`: string (URL or local path)
+
+### `notebook.query`
+Performs a semantic search/RAG generation against the notebook.
+- `notebook`: string
+- `query`: string
+
+## Configuration
+Env vars:
+- `QDRANT_URL`: URL to Qdrant (e.g., `http://qdrant.openshift-gitops.svc:6333`)
+- `TEI_URL`: URL to TEI (e.g., `http://text-embeddings.tei.svc.cluster.local:8080`)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e844665
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+mcp
+httpx
+qdrant-client
+beautifulsoup4
+pypdf
+python-dotenv
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..86536ff
--- /dev/null
+++ b/server.py
@@ -0,0 +1,142 @@
+"""MCP server exposing scoped RAG workspaces ("notebooks") on Qdrant and TEI."""
+
+import logging
+import os
+import uuid
+
+import httpx
+from mcp.server.fastmcp import FastMCP
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+
+logger = logging.getLogger(__name__)
+
+# Configuration
+QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant.openshift-gitops.svc:6333")
+TEI_URL = os.getenv("TEI_URL", "http://text-embeddings.tei.svc.cluster.local:8080")
+EMBEDDING_DIM = 768  # BAAI/bge-base-en-v1.5
+CHUNK_SIZE = 500  # Characters per chunk (very basic chunking for now)
+
+# Initialize
+mcp = FastMCP("knowledge-mcp")
+qdrant = QdrantClient(url=QDRANT_URL)
+
+
+def _collection_name(name: str) -> str:
+    """Map a user-facing notebook name to its Qdrant collection name."""
+    return name.strip().lower().replace(" ", "-")
+
+
+def _existing_collection(notebook: str) -> str | None:
+    """Return the collection backing *notebook*, or None if there is none."""
+    collection = _collection_name(notebook)
+    if collection and qdrant.collection_exists(collection):
+        return collection
+    return None
+
+
+def get_embedding(text: str) -> list[float]:
+    """Get the embedding vector for *text* from the TEI ``/embed`` endpoint.
+
+    Raises:
+        httpx.HTTPError: The request failed or TEI returned an error status.
+        ValueError: The response body is not a non-empty JSON list.
+    """
+    url = f"{TEI_URL.rstrip('/')}/embed"
+    try:
+        response = httpx.post(url, json={"inputs": text}, timeout=10.0)
+        response.raise_for_status()
+    except httpx.HTTPError:
+        logger.exception("Embedding request to %s failed", url)
+        raise
+
+    vectors = response.json()
+    if not isinstance(vectors, list) or not vectors:
+        raise ValueError(f"Unexpected embedding response from {url}: {vectors!r}")
+    return vectors[0]
+
+
+@mcp.tool()
+def create_notebook(name: str) -> str:
+    """Create a new RAG notebook (Qdrant collection).
+
+    The name is lower-cased and spaces become hyphens, so "Project Alpha"
+    is stored as "project-alpha".
+    """
+    clean_name = _collection_name(name)
+    if not clean_name:
+        return "Error: Notebook name must not be empty."
+
+    if qdrant.collection_exists(clean_name):
+        return f"Notebook '{clean_name}' already exists."
+
+    qdrant.create_collection(
+        collection_name=clean_name,
+        vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
+    )
+    return f"Notebook '{clean_name}' created successfully."
+
+
+@mcp.tool()
+def add_source(notebook: str, text: str, source_name: str = "manual") -> str:
+    """Add text content to a notebook. Ingests, chunks, and indexes.
+
+    The text is split into CHUNK_SIZE-character chunks; each chunk is embedded
+    and stored with its text and ``source_name`` as payload.
+    """
+    # Resolve the name the same way create_notebook does, so "Project Alpha"
+    # finds the "project-alpha" collection.
+    collection = _existing_collection(notebook)
+    if collection is None:
+        return f"Error: Notebook '{notebook}' does not exist."
+
+    # Fixed-width chunks; blank ones are dropped so nothing empty is embedded.
+    chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
+    chunks = [chunk for chunk in chunks if chunk.strip()]
+    if not chunks:
+        return "Error: No text provided to ingest."
+
+    points = [
+        PointStruct(
+            id=str(uuid.uuid4()),
+            vector=get_embedding(chunk),
+            payload={"source": source_name, "text": chunk},
+        )
+        for chunk in chunks
+    ]
+    qdrant.upsert(collection_name=collection, points=points)
+    return f"Added {len(points)} chunks from '{source_name}' to '{collection}'."
+
+
+@mcp.tool()
+def query_notebook(notebook: str, query: str, limit: int = 5) -> str:
+    """Query the notebook for relevant context.
+
+    Returns up to ``limit`` matching chunks, each preceded by its score.
+    """
+    collection = _existing_collection(notebook)
+    if collection is None:
+        return f"Error: Notebook '{notebook}' does not exist."
+    if not query.strip():
+        return "Error: Query must not be empty."
+    if limit < 1:
+        return "Error: limit must be at least 1."
+
+    # query_points replaces the deprecated QdrantClient.search API.
+    response = qdrant.query_points(
+        collection_name=collection,
+        query=get_embedding(query),
+        limit=limit,
+    )
+
+    results = []
+    for hit in response.points:
+        payload = hit.payload or {}  # payload is optional on scored points
+        results.append(f"--- (Score: {hit.score:.2f}) ---\n{payload.get('text', '')}\n")
+    if not results:
+        return f"No results found in notebook '{collection}'."
+    return "\n".join(results)
+
+
+if __name__ == "__main__":
+    mcp.run()