Artikel ini meneroka algoritma HybridSimilarity, rangkaian saraf canggih yang direka untuk menilai persamaan antara pasangan teks. Model hibrid ini dengan bijak menyepadukan perbandingan leksikal, fonetik, semantik dan sintaksis untuk skor persamaan yang menyeluruh.
import functools

import numpy as np
import torch
import torch.nn as nn
from Levenshtein import ratio as levenshtein_ratio
from phonetics import metaphone
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


class HybridSimilarity(nn.Module):
    """Score the similarity of two texts from lexical, phonetic and semantic cues.

    Six scalar features are computed per text pair and passed through a small
    fully connected head that ends in a sigmoid, so the output lies in (0, 1).

    NOTE: nothing in this module trains or loads weights for the attention
    layer or the head, so until they are trained the returned score is the
    output of randomly initialised layers.
    """

    # Order of the scalar features fed to ``self.fc``; the first Linear layer
    # is sized from this tuple so the two can never drift apart.
    FEATURE_NAMES = (
        'levenshtein',
        'jaccard',
        'metaphone',
        'semantic_cosine',
        'lsa_cosine',
        'attention_score',
    )

    def __init__(self):
        super().__init__()
        self.bert = SentenceTransformer('all-MiniLM-L6-v2')
        self.tfidf = TfidfVectorizer()
        self.attention = nn.MultiheadAttention(embed_dim=384, num_heads=4)
        self.fc = nn.Sequential(
            # Was Linear(1152, 256); the feature tensor has one column per
            # entry in FEATURE_NAMES, so 1152 raised a shape mismatch.
            nn.Linear(len(self.FEATURE_NAMES), 256),
            nn.ReLU(),
            nn.LayerNorm(256),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def _extract_features(self, text1: str, text2: str) -> torch.Tensor:
        """Return a ``(1, len(FEATURE_NAMES))`` float tensor for the text pair."""
        features = {}

        # Lexical analysis.
        features['levenshtein'] = levenshtein_ratio(text1, text2)
        tokens1 = set(text1.split())
        tokens2 = set(text2.split())
        union = tokens1 | tokens2
        # Two token-less texts are treated as identical instead of dividing by zero.
        features['jaccard'] = len(tokens1 & tokens2) / len(union) if union else 1.0

        # Phonetic analysis: exact match of the Metaphone codes.
        features['metaphone'] = 1.0 if metaphone(text1) == metaphone(text2) else 0.0

        # Semantic analysis. Embeddings are moved to the device of the head so
        # the attention layer and the encoder output always agree.
        device = self.fc[0].weight.device
        emb1 = self.bert.encode(text1, convert_to_tensor=True).to(device)
        emb2 = self.bert.encode(text2, convert_to_tensor=True).to(device)
        # The embeddings are 1-D, so the similarity must be taken along dim 0
        # (the CosineSimilarity default of dim=1 is out of range here).
        features['semantic_cosine'] = nn.CosineSimilarity(dim=0)(emb1, emb2).item()

        # LSA over TF-IDF. With n_components=1 each text is projected to a
        # single number, so this is the product of the two projections.
        try:
            tfidf_matrix = self.tfidf.fit_transform([text1, text2])
            lsa = TruncatedSVD(n_components=1).fit_transform(tfidf_matrix)
        except ValueError:
            # Raised e.g. for an empty vocabulary (blank or stop-word-only texts).
            features['lsa_cosine'] = 0.0
        else:
            features['lsa_cosine'] = float(np.dot(lsa[0], lsa[1]))

        # Attention: text1 is the query, text2 the key and value. Each
        # embedding is expanded to 3-D, as MultiheadAttention expects.
        query = emb1.unsqueeze(0).unsqueeze(0)
        key_value = emb2.unsqueeze(0).unsqueeze(0)
        att_output, _ = self.attention(query, key_value, key_value)
        features['attention_score'] = att_output.mean().item()

        ordered = [float(features[name]) for name in self.FEATURE_NAMES]
        return torch.tensor(ordered, dtype=torch.float32, device=device).unsqueeze(0)

    def forward(self, text1: str, text2: str) -> float:
        """Return the head's sigmoid output for the pair as a Python float."""
        features = self._extract_features(text1, text2)
        return self.fc(features).item()


@functools.lru_cache(maxsize=1)
def _default_model() -> HybridSimilarity:
    """Build the shared model once; loading the sentence encoder is expensive."""
    model = HybridSimilarity()
    model.eval()
    return model


def similarity_coefficient(text1: str, text2: str) -> float:
    """Return the HybridSimilarity score for ``text1`` and ``text2``.

    The model is created on first use and reused afterwards, so repeated calls
    do not reload the encoder and see the same layer weights.
    """
    with torch.no_grad():
        return _default_model()(text1, text2)
Model HybridSimilarity bergantung pada komponen utama ini:
Kelas HybridSimilarity, yang melanjutkan nn.Module, memulakan:
all-MiniLM-L6-v2
# Components created in HybridSimilarity.__init__.
self.bert = SentenceTransformer('all-MiniLM-L6-v2')
self.tfidf = TfidfVectorizer()
self.attention = nn.MultiheadAttention(embed_dim=384, num_heads=4)
self.fc = nn.Sequential(
    # _extract_features returns six scalar features, so the first layer must
    # accept 6 inputs; the previous width of 1152 raised a shape mismatch.
    nn.Linear(6, 256),
    nn.ReLU(),
    nn.LayerNorm(256),
    nn.Linear(256, 1),
    nn.Sigmoid(),
)
Kaedah _extract_features mengira beberapa ciri persamaan:
# Lexical features: edit-distance ratio plus word-set overlap.
features['levenshtein'] = levenshtein_ratio(text1, text2)
tokens1 = set(text1.split())
tokens2 = set(text2.split())
union = tokens1 | tokens2
# Two token-less texts are treated as identical instead of dividing by zero.
features['jaccard'] = len(tokens1 & tokens2) / len(union) if union else 1.0
# Phonetic feature: 1.0 when both texts yield the same Metaphone code, else 0.0.
codes_match = metaphone(text1) == metaphone(text2)
features['metaphone'] = 1.0 if codes_match else 0.0
# Semantic feature: cosine similarity of the two sentence embeddings.
emb1 = self.bert.encode(text1, convert_to_tensor=True)
emb2 = self.bert.encode(text2, convert_to_tensor=True)
# The embeddings are 1-D, so the similarity must be taken along dim 0; the
# CosineSimilarity default of dim=1 is out of range for 1-D inputs.
features['semantic_cosine'] = nn.CosineSimilarity(dim=0)(emb1, emb2).item()
TruncatedSVD
# LSA feature: TF-IDF of the two texts reduced to one component each.
tfidf_matrix = self.tfidf.fit_transform([text1, text2])
svd = TruncatedSVD(n_components=1)
lsa = svd.fit_transform(tfidf_matrix)
# lsa[0] and lsa[1] are length-1 vectors, so np.dot already returns a scalar;
# the former trailing [0][0] raised IndexError on that scalar.
features['lsa_cosine'] = float(np.dot(lsa[0], lsa[1]))
# Attention feature: text1 is the query, text2 the key and value; the mean of
# the attention output is kept as a single score.
query = emb1.unsqueeze(0).unsqueeze(0)
key_value = emb2.unsqueeze(0).unsqueeze(0)
att_output, _ = self.attention(query, key_value, key_value)
features['attention_score'] = att_output.mean().item()
Ciri yang diekstrak digabungkan dan dimasukkan ke dalam rangkaian saraf yang bersambung sepenuhnya. Rangkaian ini mengeluarkan skor persamaan (0-1).
import functools

import numpy as np
import torch
import torch.nn as nn
from Levenshtein import ratio as levenshtein_ratio
from phonetics import metaphone
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


class HybridSimilarity(nn.Module):
    """Score the similarity of two texts from lexical, phonetic and semantic cues.

    Six scalar features are computed per text pair and passed through a small
    fully connected head that ends in a sigmoid, so the output lies in (0, 1).

    NOTE: nothing in this module trains or loads weights for the attention
    layer or the head, so until they are trained the returned score is the
    output of randomly initialised layers.
    """

    # Order of the scalar features fed to ``self.fc``; the first Linear layer
    # is sized from this tuple so the two can never drift apart.
    FEATURE_NAMES = (
        'levenshtein',
        'jaccard',
        'metaphone',
        'semantic_cosine',
        'lsa_cosine',
        'attention_score',
    )

    def __init__(self):
        super().__init__()
        self.bert = SentenceTransformer('all-MiniLM-L6-v2')
        self.tfidf = TfidfVectorizer()
        self.attention = nn.MultiheadAttention(embed_dim=384, num_heads=4)
        self.fc = nn.Sequential(
            # Was Linear(1152, 256); the feature tensor has one column per
            # entry in FEATURE_NAMES, so 1152 raised a shape mismatch.
            nn.Linear(len(self.FEATURE_NAMES), 256),
            nn.ReLU(),
            nn.LayerNorm(256),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def _extract_features(self, text1: str, text2: str) -> torch.Tensor:
        """Return a ``(1, len(FEATURE_NAMES))`` float tensor for the text pair."""
        features = {}

        # Lexical analysis.
        features['levenshtein'] = levenshtein_ratio(text1, text2)
        tokens1 = set(text1.split())
        tokens2 = set(text2.split())
        union = tokens1 | tokens2
        # Two token-less texts are treated as identical instead of dividing by zero.
        features['jaccard'] = len(tokens1 & tokens2) / len(union) if union else 1.0

        # Phonetic analysis: exact match of the Metaphone codes.
        features['metaphone'] = 1.0 if metaphone(text1) == metaphone(text2) else 0.0

        # Semantic analysis. Embeddings are moved to the device of the head so
        # the attention layer and the encoder output always agree.
        device = self.fc[0].weight.device
        emb1 = self.bert.encode(text1, convert_to_tensor=True).to(device)
        emb2 = self.bert.encode(text2, convert_to_tensor=True).to(device)
        # The embeddings are 1-D, so the similarity must be taken along dim 0
        # (the CosineSimilarity default of dim=1 is out of range here).
        features['semantic_cosine'] = nn.CosineSimilarity(dim=0)(emb1, emb2).item()

        # LSA over TF-IDF. With n_components=1 each text is projected to a
        # single number, so this is the product of the two projections.
        try:
            tfidf_matrix = self.tfidf.fit_transform([text1, text2])
            lsa = TruncatedSVD(n_components=1).fit_transform(tfidf_matrix)
        except ValueError:
            # Raised e.g. for an empty vocabulary (blank or stop-word-only texts).
            features['lsa_cosine'] = 0.0
        else:
            features['lsa_cosine'] = float(np.dot(lsa[0], lsa[1]))

        # Attention: text1 is the query, text2 the key and value. Each
        # embedding is expanded to 3-D, as MultiheadAttention expects.
        query = emb1.unsqueeze(0).unsqueeze(0)
        key_value = emb2.unsqueeze(0).unsqueeze(0)
        att_output, _ = self.attention(query, key_value, key_value)
        features['attention_score'] = att_output.mean().item()

        ordered = [float(features[name]) for name in self.FEATURE_NAMES]
        return torch.tensor(ordered, dtype=torch.float32, device=device).unsqueeze(0)

    def forward(self, text1: str, text2: str) -> float:
        """Return the head's sigmoid output for the pair as a Python float."""
        features = self._extract_features(text1, text2)
        return self.fc(features).item()


@functools.lru_cache(maxsize=1)
def _default_model() -> HybridSimilarity:
    """Build the shared model once; loading the sentence encoder is expensive."""
    model = HybridSimilarity()
    model.eval()
    return model


def similarity_coefficient(text1: str, text2: str) -> float:
    """Return the HybridSimilarity score for ``text1`` and ``text2``.

    The model is created on first use and reused afterwards, so repeated calls
    do not reload the encoder and see the same layer weights.
    """
    with torch.no_grad():
        return _default_model()(text1, text2)
Fungsi similarity_coefficient memulakan model dan mengira persamaan antara dua teks input.
# Components created in HybridSimilarity.__init__.
self.bert = SentenceTransformer('all-MiniLM-L6-v2')
self.tfidf = TfidfVectorizer()
self.attention = nn.MultiheadAttention(embed_dim=384, num_heads=4)
self.fc = nn.Sequential(
    # _extract_features returns six scalar features, so the first layer must
    # accept 6 inputs; the previous width of 1152 raised a shape mismatch.
    nn.Linear(6, 256),
    nn.ReLU(),
    nn.LayerNorm(256),
    nn.Linear(256, 1),
    nn.Sigmoid(),
)
Ini mengembalikan nilai titik terapung (float) antara 0 dan 1, yang mewakili tahap persamaan.
Algoritma HybridSimilarity menawarkan pendekatan yang mantap kepada persamaan teks dengan menyepadukan pelbagai aspek perbandingan teks. Gabungan analisis leksikal, fonetik, semantik dan sintaksisnya membolehkan pemahaman yang lebih komprehensif dan bernuansa tentang persamaan teks, menjadikannya sesuai untuk pelbagai aplikasi, termasuk pengesanan pendua, pengelompokan teks dan pengambilan maklumat.
Di atas ialah kandungan terperinci Algoritma Kesamaan Hibrid. Untuk maklumat lanjut, sila ikuti artikel berkaitan lain di laman web China PHP!