Qdrant向量資料庫練習及Azure AI Embedding API測試
現在AI真的是爆炸式流行,身為工程師的我,自然也想研究一下RAG。不過在進入這個主題之前,得先學會使用向量資料庫;查了一下,大多數人都推薦Qdrant,所以就先拿它來練習。
首先,要將資料存入向量資料庫前,得先把文字轉為向量(Embedding),這需要呼叫Azure OpenAI提供的Embedding API,程式碼如下:
/// <summary>
/// Converts <paramref name="text"/> into an embedding vector by calling an
/// Azure OpenAI embedding deployment synchronously.
/// </summary>
/// <param name="text">The text to embed. Must not be null or empty.</param>
/// <returns>The embedding returned by the service, copied into a new float array.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="text"/> is empty.</exception>
float[] GetEmbedding(string text)
{
    // Fail fast before building a client or making a network call.
    if (text is null)
    {
        throw new ArgumentNullException(nameof(text));
    }
    if (text.Length == 0)
    {
        throw new ArgumentException("Text to embed must not be empty.", nameof(text));
    }

    // NuGet: Azure.AI.OpenAI (AzureOpenAIClient lives there, not in OpenAI-DotNet).
    // Replace the placeholders with your own resource name, key and deployment name.
    var endpoint = new Uri("https://{your name}.openai.azure.com/");
    var apiKey = "{your key}";
    var deploymentName = "text-embedding-3-small";

    var credentials = new AzureKeyCredential(apiKey);
    var azureOpenAIClient = new AzureOpenAIClient(endpoint, credentials);
    var embeddingClient = azureOpenAIClient.GetEmbeddingClient(deploymentName);

    // Blocking call; the result wraps the embedding for the single input text.
    var response = embeddingClient.GenerateEmbedding(text);
    ReadOnlyMemory<float> vector = response.Value.ToFloats();
    return vector.ToArray();
}
接下來就是把資料轉為向量並存入資料庫:
/// <summary>
/// Ensures the "text_embedding" collection exists in the local Qdrant instance and,
/// when the collection is still empty, seeds it with the sample texts below.
/// Each text is embedded via <c>GetEmbedding</c> and stored with its category and
/// original text as payload.
/// </summary>
/// <returns>A task that completes when the collection is ready.</returns>
async Task InitData()
{
    // NOTE(review): the API key is hard-coded for this demo; load it from
    // configuration or an environment variable in real code.
    // Arguments: host, gRPC port, https flag, API key. The client is disposable.
    using var client = new QdrantClient("localhost", 6334, false, "3065678qazwsx");
    const string colName = "text_embedding";

    if (!await client.CollectionExistsAsync(colName))
    {
        // Size must equal the length of the vectors GetEmbedding returns
        // (1536 is the default output dimension of text-embedding-3-small).
        await client.CreateCollectionAsync(colName,
            new VectorParams { Size = 1536, Distance = Distance.Cosine });
    }

    // Every point below gets a fresh GUID, so seeding a collection that already
    // holds data would append duplicate copies. Skip when it is already populated.
    var count = await client.CountAsync(colName);
    if (count > 0)
    {
        return;
    }

    var data = new List<TextData>
    {
        new TextData{ catg = "布袋戲", text = "清香白蓮素還真" },
        new TextData{ catg = "布袋戲", text = "半神半聖亦半仙,全儒全道是全賢,腦中真書藏萬卷,掌握文武半邊天" },
        new TextData{ catg = "布袋戲", text = "霹靂化身最多的首席男主角,溫文儒雅、器宇軒昂、超凡脫俗、武學莫測高深、足智多謀、博學多能、謙虛有禮,處世圓融冷靜、慈悲親和、關懷眾生;以武林和平、天下大同為己任,『謀為天下謀、利為天下利』 - 無我、無為!為武林風塵默默承受一切,多次以絕頂智慧化解災厄,置之死地而後生,為天下蒼生應現各種精彩玄奇的身份!幽默風趣的隨機教化,難捨能捨、忍辱負重、不計毀謗、無怨無悔,默默付出,不遺餘力、不求回報,真乃具足大慈悲與大智慧的凡聖一體,反璞歸真『素還真』。" },
        new TextData{ catg = "布袋戲", text = "日月星三才子" },
        new TextData{ catg = "詩詞", text = "床前明月光,疑是地上霜。舉頭望明月,低頭思故鄉。"},
        new TextData{ catg = "詩詞", text = "春眠不覺曉,處處聞啼鳥。夜來風雨聲,花落知多少。"},
        new TextData{ catg = "詩詞", text = "滿室天香仙子家,一琴一劍一杯茶。羽衣常帶煙霞色,不惹人間桃李花。"},
    };

    // One point per text: random UUID id, embedding vector, and the source fields as payload.
    var points = data.Select(item => new PointStruct
    {
        Id = new PointId { Uuid = Guid.NewGuid().ToString() },
        Vectors = GetEmbedding(item.text),
        Payload =
        {
            ["catg"] = item.catg,
            ["text"] = item.text
        }
    }).ToList();

    await client.UpsertAsync(colName, points);
}
最後就是查詢的部份:
// Query demo: embeds two questions, runs a plain similarity search and a
// category-filtered similarity search against the "text_embedding" collection,
// and prints the top 3 hits of each to the console.

// NOTE(review): the API key is hard-coded for this demo; load it from
// configuration or an environment variable in real code.
using var client = new QdrantClient("localhost", 6334, false, "3065678qazwsx");
const string colName = "text_embedding";

// Convert the query sentence into a vector.
string keywd = "請隨機給我一首詩";
var queryVector = GetEmbedding(keywd);

Console.ForegroundColor = ConsoleColor.Cyan;
Console.WriteLine($"# 查詢:{keywd}");
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("(向量相似度, TOP 3)");
Console.ResetColor();

// Unfiltered search: nearest vectors across every category.
var top3 = await client.SearchAsync(
    colName,
    queryVector,
    limit: 3);

int rank = 1;
foreach (var hit in top3)
{
    Console.WriteLine($"{rank++}. {hit.Score:n4} {hit.Payload["catg"].StringValue} {hit.Payload["text"].StringValue}");
}

Console.ForegroundColor = ConsoleColor.Yellow;
// Heading fixed: the original literal carried a stray "詩" left from copy/paste.
Console.WriteLine("(向量相似度 + 限定布袋戲, TOP 3)");
Console.ResetColor();

queryVector = GetEmbedding("素還真是誰");

// Filtered search: "catg" is a category label, so match it exactly
// (keyword match) rather than with a full-text/substring match.
var top3Puppetry = await client.SearchAsync(
    colName,
    queryVector,
    filter: Conditions.MatchKeyword("catg", "布袋戲"),
    limit: 3);

rank = 1;
foreach (var hit in top3Puppetry)
{
    Console.WriteLine($"{rank++}. {hit.Score:n4} {hit.Payload["text"].StringValue}");
}
執行結果:
參考資料