Algorithm


Problem Name: 393. UTF-8 Validation

Given an integer array data representing the data, return whether it is a valid UTF-8 encoding (i.e. it translates to a sequence of valid UTF-8 encoded characters).

A character in UTF8 can be from 1 to 4 bytes long, subjected to the following rules:

  1. For a 1-byte character, the first bit is a 0, followed by its Unicode code.
  2. For an n-bytes character, the first n bits are all one's, the n + 1 bit is 0, followed by n - 1 bytes with the most significant 2 bits being 10.

This is how the UTF-8 encoding would work:

     Number of Bytes   |        UTF-8 Octet Sequence
                       |              (binary)
   --------------------+-----------------------------------------
            1          |   0xxxxxxx
            2          |   110xxxxx 10xxxxxx
            3          |   1110xxxx 10xxxxxx 10xxxxxx
            4          |   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

x denotes a bit in the binary form of a byte that may be either 0 or 1.

Note: The input is an array of integers. Only the least significant 8 bits of each integer is used to store the data. This means each integer represents only 1 byte of data.

 

Example 1:

Input: data = [197,130,1]
Output: true
Explanation: data represents the octet sequence: 11000101 10000010 00000001.
It is a valid utf-8 encoding for a 2-bytes character followed by a 1-byte character.

Example 2:

Input: data = [235,140,4]
Output: false
Explanation: data represented the octet sequence: 11101011 10001100 00000100.
The first 3 bits are all one's and the 4th bit is 0 means it is a 3-bytes character.
The next byte is a continuation byte which starts with 10 and that's correct.
But the second continuation byte does not start with 10, so it is invalid.

 

Constraints:

  • 1 <= data.length <= 2 * 104
  • 0 <= data[i] <= 255

Code Examples

#1 Code Example with C Programming

Code - C Programming


bool validUtf8(int* data, int dataSize) {
    int i, n;
    unsigned char k;
    
    n = 0;
    for (i = 0; i  <  dataSize; i ++) {
        k = data[i] & 0xff;
        if (n) {
            if ((k & 0xc0) != 0x80) {
                return false;
            }
            n --;
        } else {
            while (k & 0x80) {
                k = k << 1;
                n ++;
            }
            if (n) {
                if (n == 1 || n > 4) return false;
                n --;
            }
        }
    }
    if (n) return false;
    return true;
}
Copy The Code & Try With Live Editor

Input

x
+
cmd
data = [197,130,1]

Output

x
+
cmd
true

#2 Code Example with Java Programming

Code - Java Programming


class Solution {
  public boolean validUtf8(int[] data) {
    int numberOfBytes = 0;
    for (int i = 0; i  <  data.length; i++) {
      String binaryRep = Integer.toBinaryString(data[i]);
      binaryRep = binaryRep.length() >= 8 ? 
        binaryRep.substring(binaryRep.length() - 8) : 
      "00000000".substring(binaryRep.length() % 8) + binaryRep;
      if (numberOfBytes == 0) {
        for (int j = 0; j  <  binaryRep.length(); j++) {
          if (binaryRep.charAt(j) == '0') {
            break;
          }
          numberOfBytes++;
        }
        if (numberOfBytes == 0) {
          continue;
        }
        if (numberOfBytes > 4 || numberOfBytes == 1) {
          return false;
        } 
      } else {
        if (!(binaryRep.charAt(0) == '1' && binaryRep.charAt(1) == '0')) {
          return false;
        }
      }
      numberOfBytes--;
    }
    return numberOfBytes == 0;
  }
}
Copy The Code & Try With Live Editor

Input

x
+
cmd
data = [197,130,1]

Output

x
+
cmd
true

#3 Code Example with Javascript Programming

Code - Javascript Programming


const validUtf8 = function(data) {
  let count = 0
  for (let c of data) {
    if (count === 0) {
      if (c >> 5 === 0b110) count = 1
      else if (c >> 4 === 0b1110) count = 2
      else if (c >> 3 === 0b11110) count = 3
      else if (c >> 7) return false
    } else {
      if (c >> 6 !== 0b10) return false
      count--
    }
  }
  return count == 0
}
Copy The Code & Try With Live Editor

Input

x
+
cmd
data = [235,140,4]

Output

x
+
cmd
false

#4 Code Example with Python Programming

Code - Python Programming


class Solution:
    def validUtf8(self, data: List[int]) -> bool:
        def rest(i):
            if len(data) < i:
                return False
            for _ in range(i):
                if not data.pop().startswith("10"):
                    return False
            return True

        data, byte = [str(bin(seq)[2:].zfill(8)) for seq in data[::-1]], None
        while data:
            seq = data.pop()
            if seq.startswith("0"):
                continue
            elif seq.startswith("110"):
                if not rest(1):
                    return False
            elif seq.startswith("1110"):
                if not rest(2):
                    return False
            elif seq.startswith("11110"):
                if not rest(3):
                    return False
            else:
                return False
        return True
Copy The Code & Try With Live Editor

Input

x
+
cmd
data = [235,140,4]

Output

x
+
cmd
false

#5 Code Example with C# Programming

Code - C# Programming


namespace LeetCode
{
    public class _0393_UTF8Validation
    {
        public bool ValidUtf8(int[] data)
        {
            int numberOfBytesToProcess = 0;

            int mask1 = 1 << 7;
            int mask2 = 1 << 6;

            foreach (int num in data)
            {
                if (numberOfBytesToProcess == 0)
                {
                    int mask = mask1;
                    while ((mask & num) != 0)
                    {
                        numberOfBytesToProcess++;
                        mask >>= 1;
                    }

                    if (numberOfBytesToProcess == 0) continue;
                    if (numberOfBytesToProcess > 4 || numberOfBytesToProcess == 1) return false;
                }
                else
                {
                    if ((num & mask1) == 0 || (mask2 & num) == 1)
                        return false;
                }

                numberOfBytesToProcess -= 1;
            }

            return numberOfBytesToProcess == 0;
        }
    }
}
Copy The Code & Try With Live Editor

Input

x
+
cmd
data = [197,130,1]

Output

x
+
cmd
true
Advertisements

Demonstration


Previous
#392 Leetcode Is Subsequence Solution in C, C++, Java, JavaScript, Python, C# Leetcode
Next
#394 Leetcode Decode String Solution in C, C++, Java, JavaScript, Python, C# Leetcode