現在AI算是當紅話題,很多工程師都在瘋狂學習,深怕死在沙灘上,包括我也是。而串接所謂的LLM,除了雲端上的服務,例如OpenAI的ChatGPT API,也有直接在本機執行的方式,本篇文章是示範如何在本機執行微軟的開源模型:Phi-3。

Phi-3有3個版本:

  • Phi-3-mini
  • Phi-3-small
  • Phi-3-medium

因為筆者的筆電顯卡不夠力,所以只能跑mini版本。

模型的下載網址如下:

Phi-3-mini

接下來簡單提一下如何使用模型:

// Reset the streamed response before starting a new generation run.
Response = string.Empty;
// Fixed system prompt: defines the assistant's persona and answer style.
string systemPrompt = "You are a knowledgeable and friendly assistant. Answer the following question as clearly and concisely as possible, providing any relevant information and examples.";
// The user's actual question, supplied through the Prompt property.
string userPrompt = Prompt;
// Tokenizer converts text to/from the token ids the model consumes.
var tokenizer = new Tokenizer(_model);

// Combine system prompt and user prompt using the Phi-3 chat template:
// <|system|> ... <|end|> <|user|> ... <|end|> <|assistant|>
// The trailing <|assistant|> marker cues the model to start answering.
var fullPrompt = $"<|system|>{systemPrompt}<|end|><|user|>{userPrompt}<|end|><|assistant|>";
var tokens = tokenizer.Encode(fullPrompt);

// Search/generation options; max_length caps the token sequence length
// (see the onnxruntime-genai config reference for the full option list).
var generatorParams = new GeneratorParams(_model);
generatorParams.SetSearchOption("max_length", 2048);
//generatorParams.SetSearchOption("temperature", 0.3);
generatorParams.SetInputSequences(tokens);

上面是組參數,其中提示詞有system跟user兩種,system可當作本機伺服器這裡的設定,簡單說就是告訴AI要扮演的角色(律師或工程師等),以及調整語氣。user提示詞就是使用者所問的問題,後面GeneratorParams就是參數,而max_length是回答的上限值,詳細的參數可參考這篇:

https://onnxruntime.ai/docs/genai/reference/config.html

接下來就是送參數及收回應,其中Handler是讓外部設定的callback(型別為Action,並非事件),由於我是用WinForm程式做測試,所以需要一個Handler來做介面上的更新。

// Accumulates the decoded text as tokens stream out of the model.
StringBuilder response = new StringBuilder();
var generator = new Generator(_model, generatorParams);
// Token-by-token loop: runs until the model signals completion
// (end token emitted or max_length reached).
while (!generator.IsDone())
{
    generator.ComputeLogits();
    generator.GenerateNextToken();
    // GetSequence(0) returns the whole sequence so far; only the
    // last token is new, so slice it off and decode just that one.
    var outputTokens = generator.GetSequence(0);
    var newToken = outputTokens.Slice(outputTokens.Length - 1, 1);
    var output = tokenizer.Decode(newToken);
    response.Append(output);
    // Publish the partial answer and notify the WinForms UI through
    // the externally supplied Handler callback.
    Response = response.ToString();
    Handler();
}

最後是完整程式碼:

public class LLMHelper
{
    // Fixed system prompt prepended to every request; defines the assistant
    // persona. Hoisted to a constant so both send paths share one copy.
    private const string SystemPrompt =
        "You are a knowledgeable and friendly assistant. Answer the following question as clearly and concisely as possible, providing any relevant information and examples.";

    private string _modelPath = string.Empty;

    // Loaded ONNX model; stays null until Load() is called.
    private Model? _model;

    /// <summary>
    /// Creates a helper that will load the model from the given directory.
    /// </summary>
    /// <param name="path">Directory containing the ONNX model files.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="path"/> is null or empty.
    /// </exception>
    public LLMHelper(string path)
    {
        if (string.IsNullOrEmpty(path))
        {
            // Fix: the original passed the message text as the paramName
            // argument; use the (paramName, message) overload instead.
            throw new ArgumentNullException(nameof(path), "path is invalid.");
        }
        SetModelPath(path);
    }

    /// <summary>
    /// Creates a helper using the author's default local Phi-3-mini model path.
    /// </summary>
    public LLMHelper()
    {
        SetModelPath("D:\\LLM\\onnx\\Phi-3-mini-4k-instruct-onnx\\cuda\\cuda-fp16");
        //SetModelPath("D:\\LLM\\onnx\\Phi-3-small-8k-instruct-onnx-cuda\\cuda-int4-rtn-block-32");
        //SetModelPath("D:\\LLM\\onnx\\Phi-3-medium-128k-instruct-onnx-cuda\\cuda-fp16");
    }

    private void SetModelPath(string modelPath)
    {
        _modelPath = modelPath;
    }

    /// <summary>
    /// Loads the ONNX model from the configured path.
    /// Must be called before any prompt is sent.
    /// </summary>
    public void Load()
    {
        _model = new Model(_modelPath);
    }

    /// <summary>
    /// Sends a prompt and blocks until the complete response is generated.
    /// </summary>
    /// <param name="prompt">The user's question.</param>
    /// <returns>The model's full answer.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown when <see cref="Load"/> has not been called.
    /// </exception>
    public string SendPrompt(string prompt)
    {
        return Generate(prompt, onPartial: null);
    }

    /// <summary>Question to send when using <see cref="SendPromptForThread"/>.</summary>
    public string Prompt { get; set; } = string.Empty;

    /// <summary>Partial (then complete) answer, updated as tokens stream in.</summary>
    public string Response { get; set; } = string.Empty;

    /// <summary>Callback invoked after each generated token so the UI can refresh.
    /// Note: it runs on the background thread, not the UI thread.</summary>
    public Action? Handler { get; set; }

    /// <summary>
    /// Streams a response for <see cref="Prompt"/>, updating <see cref="Response"/>
    /// and invoking <see cref="Handler"/> after each generated token.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when <see cref="Handler"/> is not set or the model is not loaded.
    /// </exception>
    public void SendPromptForThread()
    {
        // Snapshot the delegate so a concurrent reset cannot null it mid-run.
        var handler = Handler;
        if (handler == null)
        {
            // Fix: never throw NullReferenceException explicitly; a missing
            // prerequisite is an InvalidOperationException.
            throw new InvalidOperationException("not setting handler");
        }

        Response = string.Empty;
        Generate(Prompt, partial =>
        {
            Response = partial;
            handler();
        });
    }

    /// <summary>Runs <see cref="SendPromptForThread"/> on a background thread.</summary>
    public void ThreadStart()
    {
        Thread thread = new Thread(SendPromptForThread);
        thread.IsBackground = true;
        thread.Start();
    }

    // Shared generation core for both the blocking and the streaming paths
    // (the original duplicated this pipeline in SendPrompt and
    // SendPromptForThread). onPartial, when non-null, receives the
    // accumulated text after every token.
    private string Generate(string userPrompt, Action<string>? onPartial)
    {
        if (_model == null)
        {
            // Fail fast with a clear message instead of crashing inside
            // the native tokenizer when Load() was skipped.
            throw new InvalidOperationException("Model not loaded. Call Load() first.");
        }

        // Tokenizer/GeneratorParams/Generator wrap native resources and
        // implement IDisposable, so dispose them deterministically.
        using var tokenizer = new Tokenizer(_model);

        // Phi-3 chat template: <|system|>...<|end|><|user|>...<|end|><|assistant|>
        var fullPrompt = $"<|system|>{SystemPrompt}<|end|><|user|>{userPrompt}<|end|><|assistant|>";
        var tokens = tokenizer.Encode(fullPrompt);

        using var generatorParams = new GeneratorParams(_model);
        // max_length caps the token sequence length (prompt included).
        generatorParams.SetSearchOption("max_length", 2048);
        generatorParams.SetInputSequences(tokens);

        var response = new StringBuilder();
        using var generator = new Generator(_model, generatorParams);
        while (!generator.IsDone())
        {
            generator.ComputeLogits();
            generator.GenerateNextToken();
            // Only the last token of the sequence is new; decode just it.
            var outputTokens = generator.GetSequence(0);
            var newToken = outputTokens.Slice(outputTokens.Length - 1, 1);
            response.Append(tokenizer.Decode(newToken));
            if (onPartial != null)
            {
                onPartial(response.ToString());
            }
        }
        return response.ToString();
    }
}

另外程式碼已放到 GitHub 上了。

參考資料