unit uEncoding;

{$mode Delphi}

interface

uses
  SysUtils;

function CodePointToUTF8String(const CodePoint: UInt32): String;

implementation

// Reports how many bytes the UTF-8 encoding of Value occupies.
//
// Returns 1..4 for values up to $10FFFF and 0 for anything larger, which
// cannot be encoded. Values in $D800..$DFFF are not singled out: they
// report 3 like every other value in $0800..$FFFF.
function Utf8EncodeNumBytes(const Value: UInt32): Integer;
begin
  // Walk the upper bounds of each encoding length in ascending order; the
  // first bound that Value does not exceed decides the answer.
  if Value <= $007F then
    Result := 1
  else if Value <= $07FF then
    Result := 2
  else if Value <= $FFFF then
    Result := 3
  else if Value <= $10FFFF then
    Result := 4
  else
    Result := 0;
end;

// Writes the UTF-8 encoding of CodePoint to the memory at Bytes.
//
// Bytes must point to writable memory large enough for the sequence
// (at most 4 bytes are written). Returns the number of bytes written
// (1..4), or 0 - with nothing written - when CodePoint is above $10FFFF.
// Values in $D800..$DFFF are encoded as ordinary three-byte sequences.
function Utf8Encode(const CodePoint: UInt32; Bytes: PByte): Integer;
var
  Cursor: PByte;
begin
  Cursor := Bytes;

  if CodePoint <= $007F then
    begin
      // 0xxxxxxx - plain ASCII, stored as-is.
      Cursor^ := CodePoint and $7F;
      Result := 1;
    end
  else if CodePoint <= $07FF then
    begin
      // 110xxxxx 10xxxxxx - 5 high bits, then 6 low bits.
      Cursor^ := $C0 or ((CodePoint shr 6) and $1F);
      Inc(Cursor);
      Cursor^ := $80 or (CodePoint and $3F);
      Result := 2;
    end
  else if CodePoint <= $FFFF then
    begin
      // 1110xxxx 10xxxxxx 10xxxxxx - 4 + 6 + 6 bits.
      Cursor^ := $E0 or ((CodePoint shr 12) and $0F);
      Inc(Cursor);
      Cursor^ := $80 or ((CodePoint shr 6) and $3F);
      Inc(Cursor);
      Cursor^ := $80 or (CodePoint and $3F);
      Result := 3;
    end
  else if CodePoint <= $10FFFF then
    begin
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 3 + 6 + 6 + 6 bits.
      Cursor^ := $F0 or ((CodePoint shr 18) and $07);
      Inc(Cursor);
      Cursor^ := $80 or ((CodePoint shr 12) and $3F);
      Inc(Cursor);
      Cursor^ := $80 or ((CodePoint shr 6) and $3F);
      Inc(Cursor);
      Cursor^ := $80 or (CodePoint and $3F);
      Result := 4;
    end
  else
    // Beyond the last encodable value: leave the buffer untouched.
    Result := 0;
end;

// Returns the UTF-8 encoding of CodePoint as a string.
//
// The result holds the 1..4 encoded bytes, or is empty when
// Utf8EncodeNumBytes reports that CodePoint cannot be encoded.
function CodePointToUTF8String(const CodePoint: UInt32): String;
var
  NumBytes, NumWritten: Integer;
  Bytes: PByte;
begin
  Result := '';

  NumBytes := Utf8EncodeNumBytes(CodePoint);
  if NumBytes = 0 then
    Exit;

  // Scratch buffer for the encoded bytes. SetString copies out of it, so it
  // must be released afterwards - previously it was leaked on every call.
  Bytes := PByte(AllocMem(NumBytes));
  try
    NumWritten := Utf8Encode(CodePoint, Bytes);
    SetString(Result, PChar(Bytes), NumWritten);
  finally
    FreeMem(Bytes);
  end;
end;

end.

