>本教程将引导您使用Python和OpenAI构建检索增强生成(RAG)系统。RAG在生成答案之前,先从您的文档中检索相关信息,以此增强AI的响应——本质上是让AI事先做一番“研究”。
>您将要学到的内容:
项目结构:
<code>rag-project/
│
├── src/
│   ├── __init__.py
│   ├── document_loader.py
│   ├── text_processor.py
│   ├── embeddings_manager.py
│   ├── retrieval_system.py
│   └── rag_system.py
│
├── data/
│   └── documents/
│
├── requirements.txt
├── test.py
├── README.md
└── .env</code>
步骤1:环境设置:
python -m venv venv
>
venv\Scripts\activate
激活它:source venv/bin/activate
>安装软件包:pip install openai python-dotenv numpy pandas
创建requirements.txt
<code>openai==1.12.0
python-dotenv==1.0.0
numpy==1.24.3
pandas==2.1.0</code>
.env
<code>OPENAI_API_KEY=your_api_key_here</code>
步骤2:文档加载(src/document_loader.py):
>
import os
from typing import List


class DocumentLoader:
    """Load plain-text documents from a single directory.

    Only regular files whose name ends with ``.txt`` are read; every other
    entry in the directory is ignored.
    """

    def __init__(self, documents_path: str):
        """Remember the directory to read from.

        Args:
            documents_path: Path of the directory that holds the ``.txt``
                files. It is not accessed until ``load_documents`` is called.
        """
        self.documents_path = documents_path

    def load_documents(self) -> List[str]:
        """Return the text of every ``.txt`` file in the directory.

        Files are visited in sorted filename order, so repeated runs over
        the same directory yield the documents in the same order.

        Returns:
            One string per ``.txt`` file, holding the file's full contents.

        Raises:
            OSError: If the directory cannot be listed or a file cannot be
                opened (for example ``FileNotFoundError``).
            UnicodeDecodeError: If a file is not valid UTF-8.
        """
        documents = []
        # os.listdir() makes no ordering promise; sort for reproducibility.
        for filename in sorted(os.listdir(self.documents_path)):
            if not filename.endswith('.txt'):
                continue
            path = os.path.join(self.documents_path, filename)
            # A directory that happens to be named "*.txt" is not a document.
            if not os.path.isfile(path):
                continue
            # Decode explicitly as UTF-8 instead of the platform default,
            # which differs between systems and mangles non-ASCII text.
            with open(path, 'r', encoding='utf-8') as file:
                documents.append(file.read())
        return documents
>
步骤3:文本处理(src/text_processor.py):
from typing import List


class TextProcessor:
    """Split text into chunks made of whole whitespace-separated words."""

    def __init__(self, chunk_size: int = 1000):
        """Store the chunk length limit.

        Args:
            chunk_size: Maximum length of a chunk in characters, counting
                the single spaces that join its words.
        """
        self.chunk_size = chunk_size

    def split_into_chunks(self, text: str) -> List[str]:
        """Split ``text`` into chunks no longer than ``chunk_size``.

        The text is split on whitespace and the words are re-joined with
        single spaces, so runs of whitespace are collapsed. Words are never
        cut: a single word longer than ``chunk_size`` becomes a chunk of its
        own, which is the only case where a chunk exceeds the limit.

        Args:
            text: The text to split.

        Returns:
            The chunks in original word order. Empty or whitespace-only
            text yields an empty list; no chunk is ever an empty string.
        """
        chunks = []
        current_chunk = []
        # Exact length of ' '.join(current_chunk).
        current_size = 0

        for word in text.split():
            if not current_chunk:
                # The first word of a chunk is always accepted, so an
                # oversized word never flushes an empty chunk.
                current_chunk.append(word)
                current_size = len(word)
                continue

            # Length the chunk would have with this word: one joining space
            # plus the word itself.
            projected_size = current_size + 1 + len(word)
            if projected_size > self.chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size = projected_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks
步骤4:嵌入创建(src/embeddings_manager.py):
from typing import List

import openai
import numpy as np


class EmbeddingsManager:
    """Create embedding vectors for texts through the OpenAI embeddings API."""

    def __init__(self, api_key: str, model: str = "text-embedding-ada-002"):
        """Configure the OpenAI client and pick the embedding model.

        Args:
            api_key: OpenAI API key. It is assigned to the module-level
                ``openai.api_key``, so it applies to every later call made
                through the ``openai`` module in this process.
            model: Name of the embedding model to request.
        """
        openai.api_key = api_key
        self.model = model

    def create_embeddings(self, texts: List[str], batch_size: int = 100) -> List[np.ndarray]:
        """Return one embedding vector per input text, in input order.

        Texts are sent to the API in batches rather than one request per
        text, which cuts the number of network round trips.

        Args:
            texts: The texts to embed.
            batch_size: Maximum number of texts sent in a single request.

        Returns:
            A list with one ``np.ndarray`` per text; empty when ``texts``
            is empty (no request is made in that case).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        texts = list(texts)
        embeddings = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            response = openai.embeddings.create(
                model=self.model,
                input=batch,
            )
            # Each returned item carries the index of the input it belongs
            # to; order by it so results line up with ``batch``.
            ordered = sorted(response.data, key=lambda item: item.index)
            embeddings.extend(np.array(item.embedding) for item in ordered)
        return embeddings
>
步骤5:检索系统(src/retrieval_system.py):
import numpy as np
from typing import List, Tuple


class RetrievalSystem:
    """Rank stored text chunks by cosine similarity to a query embedding."""

    def __init__(self, chunks: List[str], embeddings: List[np.ndarray]):
        """Store the chunks together with their embeddings.

        Args:
            chunks: The text chunks.
            embeddings: One embedding per chunk, in the same order.

        Raises:
            ValueError: If the two sequences differ in length, since the
                chunk/embedding pairing would then be ambiguous.
        """
        if len(chunks) != len(embeddings):
            raise ValueError(
                f"got {len(chunks)} chunks but {len(embeddings)} embeddings"
            )
        self.chunks = chunks
        self.embeddings = embeddings

    def find_similar_chunks(self, query_embedding: np.ndarray, top_k: int = 3) -> List[Tuple[str, float]]:
        """Return the ``top_k`` chunks most similar to ``query_embedding``.

        Args:
            query_embedding: Embedding of the query; must have the same
                length as the stored embeddings.
            top_k: Maximum number of results. Values below 1 give an empty
                list.

        Returns:
            ``(chunk, similarity)`` pairs sorted by descending cosine
            similarity; ties keep the original chunk order. A pair whose
            query or chunk embedding has zero norm gets similarity ``0.0``
            rather than NaN, so the ranking stays well defined.
        """
        if top_k <= 0 or len(self.embeddings) == 0:
            return []

        matrix = np.asarray(self.embeddings, dtype=float)
        query = np.asarray(query_embedding, dtype=float)

        dot_products = matrix @ query
        norm_products = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)

        # Divide only where the denominator is non-zero; every other slot
        # keeps the 0.0 it was initialised with.
        similarities = np.zeros(len(matrix), dtype=float)
        np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)

        # A stable sort on the negated scores gives descending order while
        # leaving equal scores in their original order.
        ranking = np.argsort(-similarities, kind="stable")[:top_k]
        return [(self.chunks[i], float(similarities[i])) for i in ranking]
步骤6:OpenAI集成(src/rag_system.py):
import os
from dotenv import load_dotenv
from typing import List
import openai
from .document_loader import DocumentLoader
from .text_processor import TextProcessor
from .embeddings_manager import EmbeddingsManager
from .retrieval_system import RetrievalSystem


class RAGSystem:
    """Answer questions using context retrieved from local documents.

    On construction the documents are loaded, split into chunks and
    embedded. Each question is then embedded, matched against the chunk
    embeddings, and sent to an OpenAI chat model with the best chunks as
    context.
    """

    def __init__(self, documents_path: str = 'data/documents',
                 model: str = "gpt-4-turbo-preview", top_k: int = 3):
        """Read configuration and build the document index.

        Args:
            documents_path: Directory handed to ``DocumentLoader``.
            model: Chat model used to generate answers.
            top_k: Number of retrieved chunks placed in the prompt.

        Raises:
            RuntimeError: If ``OPENAI_API_KEY`` is missing or empty after
                the ``.env`` file has been loaded.
        """
        load_dotenv()
        self.api_key = os.getenv('OPENAI_API_KEY')
        # Fail here with a clear message instead of passing None on to the
        # embeddings manager and failing later.
        if not self.api_key:
            raise RuntimeError(
                "OPENAI_API_KEY is not set; add it to .env or the environment"
            )

        self.model = model
        self.top_k = top_k
        self.loader = DocumentLoader(documents_path)
        self.processor = TextProcessor()
        self.embeddings_manager = EmbeddingsManager(self.api_key)

        # Initialize system
        self.initialize_system()

    def initialize_system(self):
        """Load the documents, chunk and embed them, and build the retriever."""
        # Load and process documents
        documents = self.loader.load_documents()
        self.chunks: List[str] = []
        for doc in documents:
            self.chunks.extend(self.processor.split_into_chunks(doc))

        # Create embeddings
        self.embeddings = self.embeddings_manager.create_embeddings(self.chunks)

        # Initialize retrieval system
        self.retrieval_system = RetrievalSystem(self.chunks, self.embeddings)

    def answer_question(self, question: str) -> str:
        """Answer ``question`` using the most relevant document chunks.

        Args:
            question: The question to answer.

        Returns:
            The message content of the first choice in the chat completion
            response.
        """
        # Get question embedding
        question_embedding = self.embeddings_manager.create_embeddings([question])[0]

        # Get relevant chunks as (chunk_text, similarity) pairs
        relevant_chunks = self.retrieval_system.find_similar_chunks(
            question_embedding, top_k=self.top_k
        )

        # Prepare context from the chunk texts only
        context = "\n".join([chunk[0] for chunk in relevant_chunks])

        # Create prompt
        prompt = f"""Context: {context}\n\nQuestion: {question}\n\nAnswer:"""

        # Get response from OpenAI
        response = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the provided context to answer the question."},
                {"role": "user", "content": prompt}
            ]
        )

        return response.choices[0].message.content
步骤7:系统用法(test.py):
>将示例.txt文档放在data/documents中。然后,运行:
结论:test.py
# test.py
from src.rag_system import RAGSystem


def main() -> None:
    """Build the RAG system, ask one sample question and print the answer."""
    # Initialize the RAG system
    rag = RAGSystem()

    # Ask a question -- replace with your question based on your documents
    question = "What was the answer to the guardian’s riddle, and how did it help Kai?"
    answer = rag.answer_question(question)
    print(answer)


# Run only when executed as a script, so that importing this module does
# not construct the system or call the API.
if __name__ == "__main__":
    main()
以上是使用Python和OpenAI构建您的第一个RAG系统的详细内容。更多信息请关注PHP中文网其他相关文章!