現在AI真的是爆炸式流行,身為工程師的我自然也想研究一下RAG。不過在做這個題目之前,得先學會使用向量資料庫;查了一下,大多數人都推薦Qdrant,所以就拿它來練習。

首先,將資料存入向量資料庫之前,得先把文字轉為向量(embedding),這需要透過Azure OpenAI提供的Embedding API,程式碼如下:

// Converts a piece of text into its embedding vector by calling an Azure OpenAI
// embedding deployment.
//   text:    the text to embed.
//   returns: the embedding as a float array.
// The call is synchronous and goes over the network; any exception thrown by the
// Azure client propagates to the caller.
float[] GetEmbedding(string text)
{
    // NuGet: Azure.AI.OpenAI (provides AzureOpenAIClient).
    // Replace the placeholders below with your own resource name and key.
    var endpoint = new Uri("https://{your name}.openai.azure.com/");
    var apiKey = "{your key}";
    var deploymentName = "text-embedding-3-small";

    // A single credential is enough; the original built two identical ones
    // and used only one of them.
    var credential = new AzureKeyCredential(apiKey);

    var azureOpenAIClient = new AzureOpenAIClient(endpoint, credential);
    var embeddingClient = azureOpenAIClient.GetEmbeddingClient(deploymentName);

    // .Value unwraps the embedding from the client result wrapper.
    var embedding = embeddingClient.GenerateEmbedding(text).Value;

    // ToFloats() exposes the vector as ReadOnlyMemory<float>; copy it into an array.
    return embedding.ToFloats().ToArray();
}

接下來就是把資料轉為向量並存入資料庫:

// Creates the "text_embedding" collection in Qdrant when it does not exist yet,
// embeds a fixed set of sample texts with GetEmbedding, and upserts them as points.
// Every call generates fresh point ids, so running it again inserts the same
// samples a second time.
async Task InitData()
{
    // NOTE(review): the API key is hard-coded here; prefer loading it from
    // configuration or an environment variable outside of a demo.
    // QdrantClient is disposable, so release it when the method finishes.
    using var client = new QdrantClient("localhost", 6334, false, "3065678qazwsx");
    string colName = "text_embedding";

    if (!await client.CollectionExistsAsync(colName))
    {
        // Size has to equal the length of the vectors GetEmbedding returns
        // (1536 is the default dimension of text-embedding-3-small).
        await client.CreateCollectionAsync(colName,
            new VectorParams { Size = 1536, Distance = Distance.Cosine });
    }

    // The original also called CountAsync here, but never used the result;
    // the extra round trip has been removed.

    List<TextData> data = new()
    {
        new TextData{ catg = "布袋戲", text = "清香白蓮素還真" },
        new TextData{ catg = "布袋戲", text = "半神半聖亦半仙,全儒全道是全賢,腦中真書藏萬卷,掌握文武半邊天" },
        new TextData{ catg = "布袋戲", text = "霹靂化身最多的首席男主角,溫文儒雅、器宇軒昂、超凡脫俗、武學莫測高深、足智多謀、博學多能、謙虛有禮,處世圓融冷靜、慈悲親和、關懷眾生;以武林和平、天下大同為己任,『謀為天下謀、利為天下利』 - 無我、無為!為武林風塵默默承受一切,多次以絕頂智慧化解災厄,置之死地而後生,為天下蒼生應現各種精彩玄奇的身份!幽默風趣的隨機教化,難捨能捨、忍辱負重、不計毀謗、無怨無悔,默默付出,不遺餘力、不求回報,真乃具足大慈悲與大智慧的凡聖一體,反璞歸真『素還真』。" },
        new TextData{ catg = "布袋戲", text = "日月星三才子" },
        new TextData{ catg = "詩詞", text = "床前明月光,疑是地上霜。舉頭望明月,低頭思故鄉。"},
        new TextData{ catg = "詩詞", text = "春眠不覺曉,處處聞啼鳥。夜來風雨聲,花落知多少。"},
        new TextData{ catg = "詩詞", text = "滿室天香仙子家,一琴一劍一杯茶。羽衣常帶煙霞色,不惹人間桃李花。"},
    };

    // One point per sample: a random UUID id, the embedding as the vector, and
    // the category/text kept as payload so search results can display them.
    // GetEmbedding is a blocking call, so the samples are embedded one by one.
    var points = data.Select(t => new PointStruct
    {
        Id = new PointId { Uuid = Guid.NewGuid().ToString() },
        Vectors = GetEmbedding(t.text),
        Payload =
        {
            ["catg"] = t.catg,
            ["text"] = t.text
        }
    }).ToList();

    await client.UpsertAsync(colName, points);
}

最後就是查詢的部份:

// Query demo: embeds a question, runs a plain similarity search, then runs a
// second search restricted to one category via a payload filter.
// NOTE(review): the API key is hard-coded here; prefer loading it from
// configuration or an environment variable outside of a demo.
using var client = new QdrantClient("localhost", 6334, false, "3065678qazwsx");
string colName = "text_embedding";

// --- Query 1: similarity only -------------------------------------------
// Convert the query sentence to a vector.
string keywd = "請隨機給我一首詩";
var queryVector = GetEmbedding(keywd);

Console.ForegroundColor = ConsoleColor.Cyan;
Console.WriteLine($"# 查詢:{keywd}");
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("(向量相似度, TOP 3)");
Console.ResetColor();

var top3 = await client.SearchAsync(
  colName,
  queryVector,
  limit: 3);

int rank = 1;
foreach (var p in top3)
{
    Console.WriteLine($"{rank++}. {p.Score:n4} {p.Payload["catg"].StringValue} {p.Payload["text"].StringValue}");
}

// --- Query 2: similarity + category filter ------------------------------
keywd = "素還真是誰";
queryVector = GetEmbedding(keywd);

// Print the query text here as well, so both result lists are labelled the same way.
Console.ForegroundColor = ConsoleColor.Cyan;
Console.WriteLine($"# 查詢:{keywd}");
Console.ForegroundColor = ConsoleColor.Yellow;
// The heading previously read "限定詩布袋戲"; the stray "詩" was a leftover,
// since the filter below restricts to the 布袋戲 category.
Console.WriteLine("(向量相似度 + 限定布袋戲, TOP 3)");
Console.ResetColor();

// "catg" holds an exact category label, so an exact keyword match is the right
// condition; MatchText is meant for full-text / substring matching.
var top3Puppetry = await client.SearchAsync(
  colName,
  queryVector,
  filter: Conditions.MatchKeyword("catg", "布袋戲"),
  limit: 3);

rank = 1;
foreach (var p in top3Puppetry)
{
    Console.WriteLine($"{rank++}. {p.Score:n4} {p.Payload["text"].StringValue}");
}

執行結果:

參考資料