このチュートリアルは、PythonとOpenaiを使用して検索拡張生成(RAG)システムを構築することをガイドします。 RAGは、回答を生成する前にドキュメントから関連情報を取得することにより、AI応答を強化します。基本的に、AIに事前に「研究」を許可します。 あなたが学んだこと:
ragシステムをゼロから構築します。
ステップ1:環境のセットアップ:
<code>rag-project/ │ ├── src/ │ ├── __init__.py │ ├── document_loader.py │ ├── text_processor.py │ ├── embeddings_manager.py │ ├── retrieval_system.py │ └── rag_system.py │ ├── data/ │ └── documents/ │ ├── requirements.txt ├── test.py ├── README.md └── .env</code>
python -m venv venv
パッケージのインストール:venvScriptsactivate
source venv/bin/activate
:pip install openai python-dotenv numpy pandas
requirements.txt
:<code>openai==1.12.0 python-dotenv==1.0.0 numpy==1.24.3 pandas==2.1.0</code>
.env
):<code>OPENAI_API_KEY=your_api_key_here</code>
ステップ3:テキスト処理(src/document_loader.py
):
<code class="language-python">import os from typing import List class DocumentLoader: def __init__(self, documents_path: str): self.documents_path = documents_path def load_documents(self) -> List[str]: documents = [] for filename in os.listdir(self.documents_path): if filename.endswith('.txt'): with open(os.path.join(self.documents_path, filename), 'r') as file: documents.append(file.read()) return documents</code>
ステップ4:埋め込み作成(src/text_processor.py
):
<code class="language-python">from typing import List class TextProcessor: def __init__(self, chunk_size: int = 1000): self.chunk_size = chunk_size def split_into_chunks(self, text: str) -> List[str]: words = text.split() chunks = [] current_chunk = [] current_size = 0 for word in words: if current_size + len(word) > self.chunk_size: chunks.append(' '.join(current_chunk)) current_chunk = [word] current_size = len(word) else: current_chunk.append(word) current_size += len(word) + 1 if current_chunk: chunks.append(' '.join(current_chunk)) return chunks</code>
ステップ5:検索システム(src/embeddings_manager.py
):
<code class="language-python">from typing import List import openai import numpy as np class EmbeddingsManager: def __init__(self, api_key: str): openai.api_key = api_key def create_embeddings(self, texts: List[str]) -> List[np.ndarray]: embeddings = [] for text in texts: response = openai.embeddings.create( model="text-embedding-ada-002", input=text ) embeddings.append(np.array(response.data[0].embedding)) return embeddings</code>
ステップ6:openai Integration(src/retrieval_system.py
):
<code class="language-python">import numpy as np from typing import List, Tuple class RetrievalSystem: def __init__(self, chunks: List[str], embeddings: List[np.ndarray]): self.chunks = chunks self.embeddings = embeddings def find_similar_chunks(self, query_embedding: np.ndarray, top_k: int = 3) -> List[Tuple[str, float]]: similarities = [] for i, embedding in enumerate(self.embeddings): similarity = np.dot(query_embedding, embedding) / ( np.linalg.norm(query_embedding) * np.linalg.norm(embedding) ) similarities.append((self.chunks[i], similarity)) return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_k]</code>
ステップ7:システムの使用(src/rag_system.py
):
<code class="language-python">import os from dotenv import load_dotenv from typing import List import openai from .document_loader import DocumentLoader from .text_processor import TextProcessor from .embeddings_manager import EmbeddingsManager from .retrieval_system import RetrievalSystem class RAGSystem: def __init__(self): load_dotenv() self.api_key = os.getenv('OPENAI_API_KEY') self.loader = DocumentLoader('data/documents') self.processor = TextProcessor() self.embeddings_manager = EmbeddingsManager(self.api_key) # Initialize system self.initialize_system() def initialize_system(self): # Load and process documents documents = self.loader.load_documents() self.chunks = [] for doc in documents: self.chunks.extend(self.processor.split_into_chunks(doc)) # Create embeddings self.embeddings = self.embeddings_manager.create_embeddings(self.chunks) # Initialize retrieval system self.retrieval_system = RetrievalSystem(self.chunks, self.embeddings) def answer_question(self, question: str) -> str: # Get question embedding question_embedding = self.embeddings_manager.create_embeddings([question])[0] # Get relevant chunks relevant_chunks = self.retrieval_system.find_similar_chunks(question_embedding) # Prepare context context = "\n".join([chunk[0] for chunk in relevant_chunks]) # Create prompt prompt = f"""Context: {context}\n\nQuestion: {question}\n\nAnswer:""" # Get response from OpenAI response = openai.chat.completions.create( model="gpt-4-turbo-preview", messages=[ {"role": "system", "content": "You are a helpful assistant. Use the provided context to answer the question."}, {"role": "user", "content": prompt} ] ) return response.choices[0].message.content</code>
ドキュメント。 次に、test.py
:を実行します
.txt
結論:data/documents
test.py
以上がPythonとOpenaiで最初のぼろきシステムを構築しますの詳細内容です。詳細については、PHP 中国語 Web サイトの他の関連記事を参照してください。