2020-09-30 19:29:21 +02:00
|
|
|
--- This library provides basic support for UTF-8 encoding.
|
|
|
|
-- @module utf8
|
|
|
|
|
|
|
|
-- Namespace table; the function stubs in this file are declared as its fields.
local utf8 = {}
|
2016-06-19 12:36:54 +02:00
|
|
|
|
|
|
|
---
|
|
|
|
-- Receives zero or more integers, converts each one to its corresponding UTF-8 byte sequence and returns
|
|
|
|
-- a string with the concatenation of all these sequences.
|
2020-09-30 19:29:21 +02:00
|
|
|
-- @param ... zero or more integers, each converted to its UTF-8 byte sequence
-- @return a string with the concatenation of all the resulting byte sequences
function utf8.char (...) end
|
2016-06-19 12:36:54 +02:00
|
|
|
|
|
|
|
---
|
|
|
|
-- The pattern "[\0-\x7F\xC2-\xF4][\x80-\xBF]*", which matches exactly one
|
|
|
|
-- UTF-8 byte sequence, assuming that the subject is a valid UTF-8 string.
|
2020-09-30 19:29:21 +02:00
|
|
|
-- @field charpattern
|
2016-06-19 12:36:54 +02:00
|
|
|
|
|
|
|
---
|
2020-09-30 19:29:21 +02:00
|
|
|
-- Iterates over all characters in a string.
|
|
|
|
--
|
2016-06-19 12:36:54 +02:00
|
|
|
-- for p, c in utf8.codes(s) do body end
|
2020-09-30 19:29:21 +02:00
|
|
|
--
|
2016-06-19 12:36:54 +02:00
|
|
|
-- will iterate over all characters in string s, with p being the position (in bytes) and c the code point
|
2020-09-30 19:29:21 +02:00
|
|
|
-- of each character. It raises an error if it meets any invalid byte sequence.
|
|
|
|
-- @param s the string to iterate over
-- @return iterator values for a generic `for` loop, yielding the byte position
-- and the code point of each character
function utf8.codes (s) end
|
|
|
|
|
2016-06-19 12:36:54 +02:00
|
|
|
---
|
2020-09-30 19:29:21 +02:00
|
|
|
-- Returns the codepoints (as integers) from all characters in s that start between byte position i and j (both included).
|
2016-06-19 12:36:54 +02:00
|
|
|
-- The default for i is 1 and for j is i. It raises an error if it meets any invalid byte sequence.
|
2020-09-30 19:29:21 +02:00
|
|
|
-- @param s the string to read code points from
-- @param i (optional) first byte position to consider; defaults to 1
-- @param j (optional) last byte position to consider; defaults to i
-- @return the code points (as integers) of all characters that start between i and j
function utf8.codepoint (s, i, j) end
|
2016-06-19 12:36:54 +02:00
|
|
|
|
|
|
|
---
|
2020-09-30 19:29:21 +02:00
|
|
|
-- Returns the number of UTF-8 characters in string s that start between positions i and j (both inclusive).
|
2016-06-19 12:36:54 +02:00
|
|
|
-- The default for i is 1 and for j is -1. If it finds any invalid byte sequence, returns a false value plus
|
2020-09-30 19:29:21 +02:00
|
|
|
-- the position of the first invalid byte.
|
|
|
|
-- @param s the string whose characters are counted
-- @param i (optional) first position to consider; defaults to 1
-- @param j (optional) last position to consider; defaults to -1
-- @return the number of UTF-8 characters that start between i and j, or a false
-- value plus the position of the first invalid byte
function utf8.len (s, i, j) end
|
2016-06-19 12:36:54 +02:00
|
|
|
|
|
|
|
---
|
|
|
|
-- Returns the position (in bytes) where the encoding of the n-th character of s (counting from position i) starts.
|
2020-09-30 19:29:21 +02:00
|
|
|
-- A negative n gets characters before position i. The default for i is 1 when n is non-negative
|
2016-06-19 12:36:54 +02:00
|
|
|
-- and #s + 1 otherwise, so that utf8.offset(s, -n) gets the offset of the n-th character from the end
|
|
|
|
-- of the string.
|
|
|
|
-- If the specified character is neither in the subject nor right after its end, the function returns nil.
|
2020-09-30 19:29:21 +02:00
|
|
|
--
|
|
|
|
-- As a special case, when n is 0 the function returns the start of the encoding of the character that contains the i-th byte of s.
|
|
|
|
--
|
|
|
|
-- This function assumes that s is a valid UTF-8 string.
|
|
|
|
-- @param s the string to search; assumed to be valid UTF-8
-- @param n index of the character to locate, counted from position i; a negative
-- value counts characters before i, and 0 selects the character containing byte i
-- @param i (optional) reference byte position; defaults to 1 when n is
-- non-negative and to #s + 1 otherwise
-- @return the byte position where the character's encoding starts, or nil if the
-- character is neither in the subject nor right after its end
function utf8.offset (s, n, i) end
|