Sentence-BERT

Posted on 2025/08/09, 11:11 AM By admin22

長文の文書構造、単語の関係性を解析する手段をClaudeで調べていたところ、Sentence-BERTというキーワードがでてきたため、これを掘り下げて、具体的に実装してみました。
説明はコード中のコメントを読めばわかると思いますが、Claudeが出力したコードを編集して、文章間の類似度（距離）の計算や類似文検索をテストしました。

Neo4jは下記の記事で構築した環境を使用しています。
https://decode.red/net/archives/2198

インストール
pip install sentence-transformers
pip install matplotlib
pip install seaborn
pip install neo4j
pip install japanize-matplotlib
pip install faiss-gpu-cu12

from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import japanize_matplotlib

import seaborn as sns

import faiss

# Sample corpus: short Japanese sentences on a few different topics
# (machine learning / NLP, weather, nature, data science).
documents = [
    "機械学習はAIの基本技術です",
    "機械学習はデータから自動的にパターンを学習する技術です",
    "深層学習はニューラルネットワークを多層化した手法です",
    "自然言語処理は人間の言語を計算機で処理する技術です",
    "今日は天気が良く、散歩日和です",
    "明日は雨が降る予報が出ています",
    "桜の花が美しく咲いています",
    "データサイエンスは統計学と計算機科学の融合分野です",
    "ビッグデータの解析には高性能な計算機が必要です"
]

# Sentence-BERT model used for every encoding step in this script.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# Generate one sentence embedding per document (row i belongs to documents[i]).
embeddings = model.encode(documents)

class DocumentGraphBuilder:
    """Store documents and their pairwise similarity in a Neo4j graph.

    Every document becomes a ``Document`` node with ``id``, ``text`` and
    ``embedding`` properties. Pairs whose cosine similarity exceeds a
    threshold are connected by a ``SIMILAR`` relationship carrying ``score``.

    Writes use ``MERGE`` so that running the script again updates the existing
    nodes/relationships instead of creating duplicates. The instance can be
    used as a context manager, which closes the driver on exit.
    """

    def __init__(self, uri, user, password):
        """Open a Neo4j driver for ``uri`` authenticated as ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress an exception raised inside the block

    def close(self):
        """Close the underlying driver and release its connections."""
        self.driver.close()

    def create_document_nodes(self, documents, vectors=None):
        """Create or update one ``Document`` node per document.

        Args:
            documents: Sequence of document texts; the position in the
                sequence is stored as the node's ``id`` property.
            vectors: One embedding per document, in the same order. When
                omitted, the module-level ``embeddings`` array is used, which
                is what this method always did before the parameter existed.

        Raises:
            ValueError: If the number of embeddings differs from the number
                of documents (previously the extras were silently dropped).
        """
        docs = list(documents)
        if vectors is None:
            vectors = embeddings
        if len(docs) != len(vectors):
            raise ValueError(
                f"got {len(docs)} documents but {len(vectors)} embeddings"
            )
        if not docs:
            return

        rows = [
            # Plain Python floats: the driver cannot be assumed to accept numpy arrays.
            {"id": i, "text": doc, "embedding": np.asarray(vec, dtype=float).tolist()}
            for i, (doc, vec) in enumerate(zip(docs, vectors))
        ]
        with self.driver.session() as session:
            # A single UNWIND query writes all nodes in one round trip.
            # consume() waits for completion so a server-side error is raised here.
            session.run(
                """
                UNWIND $rows AS row
                MERGE (d:Document {id: row.id})
                SET d.text = row.text, d.embedding = row.embedding
                """,
                rows=rows,
            ).consume()

    def create_similarity_edges(self, threshold=0.7):
        """Link every pair of documents whose cosine similarity exceeds ``threshold``.

        The embeddings are read back from the stored ``Document`` nodes once,
        the full similarity matrix is computed in a single call, and each
        unordered pair is considered exactly once. Relationships point from
        the node with the smaller ``id`` property to the larger one.

        Args:
            threshold: Minimum (exclusive) cosine similarity for an edge.
        """
        with self.driver.session() as session:
            result = session.run(
                "MATCH (d:Document) "
                "RETURN d.id AS id, d.embedding AS embedding ORDER BY d.id"
            )
            rows = [(record["id"], record["embedding"]) for record in result]
            if len(rows) < 2:
                return  # no pairs to compare

            ids = [doc_id for doc_id, _ in rows]
            matrix = cosine_similarity(np.array([vec for _, vec in rows]))

            # Indices strictly above the diagonal enumerate each pair once.
            edges = [
                {"source": ids[a], "target": ids[b], "score": float(matrix[a, b])}
                for a, b in zip(*np.triu_indices(len(ids), k=1))
                if matrix[a, b] > threshold
            ]
            if not edges:
                return

            session.run(
                """
                UNWIND $edges AS edge
                MATCH (d1:Document {id: edge.source}), (d2:Document {id: edge.target})
                MERGE (d1)-[r:SIMILAR]->(d2)
                SET r.score = edge.score
                """,
                edges=edges,
            ).consume()

# Write the documents and their similarity edges to Neo4j.
# NOTE: the connection URI and credentials are hard-coded for this demo.
graph_builder = DocumentGraphBuilder("bolt://192.168.0.180:7687", "neo4j", "password")
try:
    graph_builder.create_document_nodes(documents)
    graph_builder.create_similarity_edges(threshold=0.8)
finally:
    # Release the driver's connections even if one of the queries fails.
    graph_builder.close()


print(f"埋め込みの形状: {embeddings.shape}")

# Cluster the sentence embeddings into three groups.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Project the embeddings onto two principal components for plotting.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                      c=clusters, cmap='viridis')
plt.colorbar(scatter)

# Label each point with its index and the first 20 characters of its text.
for i, doc in enumerate(documents):
    # Only add the ellipsis when the text was actually cut off.
    label = doc if len(doc) <= 20 else f"{doc[:20]}..."
    plt.annotate(f"{i}: {label}",
                 (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)

plt.title('文書クラスタリング結果')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()


# Cosine similarity between every pair of sentence embeddings.
similarity_matrix = cosine_similarity(embeddings)

# Draw the matrix as an annotated heatmap; both axes are labelled by document index.
plt.figure(figsize=(10, 6))
_doc_indices = range(len(documents))
sns.heatmap(
    similarity_matrix,
    annot=True,
    cmap='coolwarm',
    xticklabels=_doc_indices,
    yticklabels=_doc_indices,
)
plt.title('文章間の類似性行列')
plt.show()



class FastSimilaritySearch:
    """Cosine-similarity search over sentence embeddings using a FAISS index.

    Vectors are L2-normalised and stored in an inner-product index, so the
    inner product returned by FAISS equals the cosine similarity.
    """

    def __init__(self):
        self.index = None      # FAISS index; None until build_index() is called
        self.documents = []    # texts in the same order as the indexed vectors

    def build_index(self, documents, vectors=None):
        """Build the index for ``documents``.

        Args:
            documents: Sequence of document texts.
            vectors: One embedding per document, in the same order. When
                omitted, the module-level ``embeddings`` array is used, which
                is what this method always did before the parameter existed.

        Raises:
            ValueError: If ``vectors`` is not a 2-D array with one row per
                document.
        """
        docs = list(documents)
        if vectors is None:
            vectors = embeddings

        # np.array() copies, so the in-place normalisation below no longer
        # modifies the caller's (or the module-level) embedding array.
        matrix = np.array(vectors, dtype='float32', order='C')
        if matrix.ndim != 2 or matrix.shape[0] != len(docs):
            raise ValueError(
                f"expected one embedding per document, got shape {matrix.shape} "
                f"for {len(docs)} documents"
            )

        # Normalise so that inner product == cosine similarity.
        faiss.normalize_L2(matrix)
        index = faiss.IndexFlatIP(matrix.shape[1])  # inner-product index
        index.add(matrix)

        # Publish index and texts together so search() always sees a matching pair.
        self.index = index
        self.documents = docs

        print(f"インデックスに {len(docs)} 文書を追加しました")

    def search(self, query, k=5):
        """Return up to ``k`` indexed documents most similar to ``query``.

        Args:
            query: Query text; it is encoded with the module-level ``model``.
            k: Maximum number of hits to return.

        Returns:
            A list of dicts with keys ``'document'`` (text), ``'score'``
            (cosine similarity as ``float``) and ``'index'`` (position of the
            document as ``int``), in the order returned by FAISS.

        Raises:
            ValueError: If ``build_index()`` has not been called yet.
        """
        if self.index is None:
            raise ValueError("まずbuild_index()を実行してください")

        # Encode and normalise the query the same way as the indexed vectors.
        query_vec = np.ascontiguousarray(model.encode([query]), dtype='float32')
        faiss.normalize_L2(query_vec)

        scores, indices = self.index.search(query_vec, k)

        # FAISS pads with -1 when fewer than k vectors are available.
        return [
            {
                'document': self.documents[idx],
                'score': float(score),
                'index': int(idx),
            }
            for score, idx in zip(scores[0], indices[0])
            if idx != -1
        ]

# Index the sample corpus and run one example query against it.
search_engine = FastSimilaritySearch()
search_engine.build_index(documents)

query = "AIと機械学習について"
results = search_engine.search(query, k=3)

# Print the hits, best match first, numbered from 1.
print(f"クエリ: {query}")
print("検索結果:")
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. スコア: {hit['score']:.3f}")
    print(f"   文書: {hit['document']}")
    print()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

from neo4j import GraphDatabase

from sentence_transformers import SentenceTransformer

import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import japanize_matplotlib

import seaborn as sns

import faiss

# Sample corpus: short Japanese sentences on a few different topics.
documents = [
    "機械学習はAIの基本技術です",
    "機械学習はデータから自動的にパターンを学習する技術です",
    "深層学習はニューラルネットワークを多層化した手法です",
    "自然言語処理は人間の言語を計算機で処理する技術です",
    "今日は天気が良く、散歩日和です",
    "明日は雨が降る予報が出ています",
    "桜の花が美しく咲いています",
    "データサイエンスは統計学と計算機科学の融合分野です",
    "ビッグデータの解析には高性能な計算機が必要です",
]

# Sentence-BERT model used for every encoding step in this script.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# One sentence embedding per document (row i belongs to documents[i]).
embeddings = model.encode(documents)

class DocumentGraphBuilder:
    """Store documents and their pairwise similarity in a Neo4j graph.

    Every document becomes a ``Document`` node with ``id``, ``text`` and
    ``embedding`` properties. Pairs whose cosine similarity exceeds a
    threshold are connected by a ``SIMILAR`` relationship carrying ``score``.

    Writes use ``MERGE`` so that running the script again updates the existing
    nodes/relationships instead of creating duplicates. The instance can be
    used as a context manager, which closes the driver on exit.
    """

    def __init__(self, uri, user, password):
        """Open a Neo4j driver for ``uri`` authenticated as ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress an exception raised inside the block

    def close(self):
        """Close the underlying driver and release its connections."""
        self.driver.close()

    def create_document_nodes(self, documents, vectors=None):
        """Create or update one ``Document`` node per document.

        Args:
            documents: Sequence of document texts; the position in the
                sequence is stored as the node's ``id`` property.
            vectors: One embedding per document, in the same order. When
                omitted, the module-level ``embeddings`` array is used.

        Raises:
            ValueError: If the number of embeddings differs from the number
                of documents.
        """
        docs = list(documents)
        if vectors is None:
            vectors = embeddings
        if len(docs) != len(vectors):
            raise ValueError(
                f"got {len(docs)} documents but {len(vectors)} embeddings"
            )
        if not docs:
            return

        rows = [
            # Plain Python floats: the driver cannot be assumed to accept numpy arrays.
            {"id": i, "text": doc, "embedding": np.asarray(vec, dtype=float).tolist()}
            for i, (doc, vec) in enumerate(zip(docs, vectors))
        ]
        with self.driver.session() as session:
            # A single UNWIND query writes all nodes in one round trip.
            # consume() waits for completion so a server-side error is raised here.
            session.run(
                """
                UNWIND $rows AS row
                MERGE (d:Document {id: row.id})
                SET d.text = row.text, d.embedding = row.embedding
                """,
                rows=rows,
            ).consume()

    def create_similarity_edges(self, threshold=0.7):
        """Link every pair of documents whose cosine similarity exceeds ``threshold``.

        The embeddings are read back from the stored ``Document`` nodes once,
        the full similarity matrix is computed in a single call, and each
        unordered pair is considered exactly once. Relationships point from
        the node with the smaller ``id`` property to the larger one.

        Args:
            threshold: Minimum (exclusive) cosine similarity for an edge.
        """
        with self.driver.session() as session:
            result = session.run(
                "MATCH (d:Document) "
                "RETURN d.id AS id, d.embedding AS embedding ORDER BY d.id"
            )
            rows = [(record["id"], record["embedding"]) for record in result]
            if len(rows) < 2:
                return  # no pairs to compare

            ids = [doc_id for doc_id, _ in rows]
            matrix = cosine_similarity(np.array([vec for _, vec in rows]))

            # Indices strictly above the diagonal enumerate each pair once.
            edges = [
                {"source": ids[a], "target": ids[b], "score": float(matrix[a, b])}
                for a, b in zip(*np.triu_indices(len(ids), k=1))
                if matrix[a, b] > threshold
            ]
            if not edges:
                return

            session.run(
                """
                UNWIND $edges AS edge
                MATCH (d1:Document {id: edge.source}), (d2:Document {id: edge.target})
                MERGE (d1)-[r:SIMILAR]->(d2)
                SET r.score = edge.score
                """,
                edges=edges,
            ).consume()

# Write the documents and their similarity edges to Neo4j.
# NOTE: the connection URI and credentials are hard-coded for this demo.
graph_builder = DocumentGraphBuilder("bolt://192.168.0.180:7687", "neo4j", "password")
try:
    graph_builder.create_document_nodes(documents)
    graph_builder.create_similarity_edges(threshold=0.8)
finally:
    # Release the driver's connections even if one of the queries fails.
    graph_builder.close()

print(f"埋め込みの形状: {embeddings.shape}")

# Cluster the sentence embeddings into three groups.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Project the embeddings onto two principal components for plotting.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                      c=clusters, cmap='viridis')
plt.colorbar(scatter)

# Label each point with its index and the first 20 characters of its text.
for i, doc in enumerate(documents):
    # Only add the ellipsis when the text was actually cut off.
    label = doc if len(doc) <= 20 else f"{doc[:20]}..."
    plt.annotate(f"{i}: {label}",
                 (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)

plt.title('文書クラスタリング結果')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

# Cosine similarity between every pair of sentence embeddings.
similarity_matrix = cosine_similarity(embeddings)

# Draw the matrix as an annotated heatmap; both axes are labelled by document index.
plt.figure(figsize=(10, 6))
sns.heatmap(similarity_matrix,
            xticklabels=range(len(documents)),
            yticklabels=range(len(documents)),
            annot=True, cmap='coolwarm')
plt.title('文章間の類似性行列')
plt.show()

class FastSimilaritySearch:
    """Cosine-similarity search over sentence embeddings using a FAISS index.

    Vectors are L2-normalised and stored in an inner-product index, so the
    inner product returned by FAISS equals the cosine similarity.
    """

    def __init__(self):
        self.index = None      # FAISS index; None until build_index() is called
        self.documents = []    # texts in the same order as the indexed vectors

    def build_index(self, documents, vectors=None):
        """Build the index for ``documents``.

        Args:
            documents: Sequence of document texts.
            vectors: One embedding per document, in the same order. When
                omitted, the module-level ``embeddings`` array is used.

        Raises:
            ValueError: If ``vectors`` is not a 2-D array with one row per
                document.
        """
        docs = list(documents)
        if vectors is None:
            vectors = embeddings

        # np.array() copies, so the in-place normalisation below no longer
        # modifies the caller's (or the module-level) embedding array.
        matrix = np.array(vectors, dtype='float32', order='C')
        if matrix.ndim != 2 or matrix.shape[0] != len(docs):
            raise ValueError(
                f"expected one embedding per document, got shape {matrix.shape} "
                f"for {len(docs)} documents"
            )

        # Normalise so that inner product == cosine similarity.
        faiss.normalize_L2(matrix)
        index = faiss.IndexFlatIP(matrix.shape[1])  # inner-product index
        index.add(matrix)

        # Publish index and texts together so search() always sees a matching pair.
        self.index = index
        self.documents = docs

        print(f"インデックスに {len(docs)} 文書を追加しました")

    def search(self, query, k=5):
        """Return up to ``k`` indexed documents most similar to ``query``.

        Args:
            query: Query text; it is encoded with the module-level ``model``.
            k: Maximum number of hits to return.

        Returns:
            A list of dicts with keys ``'document'`` (text), ``'score'``
            (cosine similarity as ``float``) and ``'index'`` (position of the
            document as ``int``), in the order returned by FAISS.

        Raises:
            ValueError: If ``build_index()`` has not been called yet.
        """
        if self.index is None:
            raise ValueError("まずbuild_index()を実行してください")

        # Encode and normalise the query the same way as the indexed vectors.
        query_vec = np.ascontiguousarray(model.encode([query]), dtype='float32')
        faiss.normalize_L2(query_vec)

        scores, indices = self.index.search(query_vec, k)

        # FAISS pads with -1 when fewer than k vectors are available.
        return [
            {
                'document': self.documents[idx],
                'score': float(score),
                'index': int(idx),
            }
            for score, idx in zip(scores[0], indices[0])
            if idx != -1
        ]

# Index the sample corpus and run one example query against it.
search_engine = FastSimilaritySearch()
search_engine.build_index(documents)

query = "AIと機械学習について"
results = search_engine.search(query, k=3)

# Print the hits, best match first, numbered from 1.
print(f"クエリ: {query}")
print("検索結果:")
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. スコア: {hit['score']:.3f}")
    print(f" 文書: {hit['document']}")
    print()

前回の実行結果がデータベースに残るので、実行のたびに下記のCypherで全ノードを削除します。

MATCH (n) DETACH DELETE n

このような学ぶ道具としてのAIの活用は、本当に便利に感じます。
これだったらどんな難解なコーディングでもいけそうな気がしてきます。

About admin22

Categories: 未分類　タグ: LLM

About admin22

Recent Posts

Tag

Links

Archive