nix_utf8_decode

Decodes UTF-8 text into Unicode codepoints
git clone https://0xdd.org/code/nix_utf8_decode.git
Log | Files | Refs | README | LICENSE

main.c (1577B)


      1 /*
      2 2018 David DiPaola
      3 licensed under CC0 (public domain, see https://creativecommons.org/publicdomain/zero/1.0/)
      4 */
      5 
      6 #include <unistd.h>
      7 
      8 #include <stdio.h>
      9 
     /*
     Prints one Unicode codepoint to stdout in "U+XXXX" notation, followed by a
     newline.

     The hexadecimal value is zero-padded to at least four digits and widens to
     five or six digits for codepoints above U+FFFF. Values above U+10FFFF lie
     outside the Unicode codespace: they are reported on stderr and nothing is
     written to stdout for them.
     */
     static void
     _print_codepoint(unsigned int codepoint) {
         if (codepoint > 0x10FFFF) {
             fprintf(stderr, "ERROR: codepoint out of range: 0x%X" "\n", codepoint);
             return;
         }

         /* "%04X" is only a minimum width, so larger values widen on their own */
         printf("U+%04X" "\n", codepoint);
     }
     22 
     /*
     Classifies the first byte of a UTF-8 sequence.

     Returns the number of continuation bytes that must follow (0 to 3), or -1
     if the byte cannot start a sequence. On success *payload receives the
     codepoint bits carried by the start byte and *minimum receives the smallest
     codepoint that genuinely needs a sequence of this length. Both outputs are
     left untouched on failure.
     */
     static int
     decode_start_byte(unsigned char byte, unsigned int *payload, unsigned int *minimum) {
         if (byte <= 0x7F) {                /* 0xxxxxxx */
             *payload = byte;
             *minimum = 0;
             return 0;
         }
         if ((byte & 0xE0) == 0xC0) {       /* 110xxxxx */
             *payload = byte & 0x1F;
             *minimum = 0x80;
             return 1;
         }
         if ((byte & 0xF0) == 0xE0) {       /* 1110xxxx */
             *payload = byte & 0x0F;
             *minimum = 0x800;
             return 2;
         }
         if ((byte & 0xF8) == 0xF0) {       /* 11110xxx */
             *payload = byte & 0x07;
             *minimum = 0x10000;
             return 3;
         }
         return -1;                         /* stray continuation byte or 0xF8..0xFF */
     }

     /*
     Validates a fully assembled multi-byte codepoint and prints it.

     Overlong encodings (value below `minimum`), UTF-16 surrogates and values
     above U+10FFFF are not valid UTF-8; each is reported on stderr with the
     offset at which the sequence started instead of being printed.
     */
     static void
     finish_sequence(unsigned int codepoint, unsigned int minimum, size_t start) {
         if (codepoint < minimum) {
             fprintf(stderr, "ERROR at offset 0x%zX: overlong encoding of U+%04X" "\n", start, codepoint);
         }
         else if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
             fprintf(stderr, "ERROR at offset 0x%zX: surrogate codepoint: U+%04X" "\n", start, codepoint);
         }
         else if (codepoint > 0x10FFFF) {
             fprintf(stderr, "ERROR at offset 0x%zX: codepoint out of range: 0x%X" "\n", start, codepoint);
         }
         else {
             _print_codepoint(codepoint);
         }
     }

     /*
     Reads UTF-8 from stdin and prints every decoded codepoint on its own line.

     Malformed input is reported on stderr together with the byte offset and
     decoding carries on with the following data. A byte that interrupts an
     unfinished sequence is re-examined as a possible start byte, so the
     character it begins is not lost.

     Returns 0 once end of input is reached (also when decode errors were
     reported), or 1 if reading stdin fails.
     */
     int
     main(void) {
         unsigned char buffer[4096];
         unsigned int  remaining = 0;   /* continuation bytes still expected */
         unsigned int  codepoint = 0;   /* bits gathered for the current sequence */
         unsigned int  minimum   = 0;   /* smallest non-overlong value for it */
         size_t        offset    = 0;   /* offset of the byte being examined */
         size_t        start     = 0;   /* offset of the current start byte */
         ssize_t       count;

         /* read in chunks: one syscall per byte is needlessly slow */
         while ((count = read(STDIN_FILENO, buffer, sizeof(buffer))) > 0) {
             for (ssize_t i = 0; i < count; i++, offset++) {
                 unsigned char byte   = buffer[i];
                 int           resync = 0;

                 if (remaining != 0) {
                     if ((byte & 0xC0) == 0x80) {   /* 10xxxxxx */
                         codepoint = (codepoint << 6) | (byte & 0x3F);
                         remaining--;
                         if (remaining == 0) {
                             finish_sequence(codepoint, minimum, start);
                         }
                         continue;
                     }

                     fprintf(stderr, "ERROR at offset 0x%zX: invalid continuation byte: 0x%02X" "\n", offset, byte);
                     codepoint = 0;
                     remaining = 0;
                     /* fall through: this byte may itself begin a new sequence */
                     resync = 1;
                 }

                 int length = decode_start_byte(byte, &codepoint, &minimum);
                 if (length < 0) {
                     /* already reported above when it broke a sequence */
                     if (!resync) {
                         fprintf(stderr, "ERROR at offset 0x%zX: invalid start byte: 0x%02X" "\n", offset, byte);
                     }
                     codepoint = 0;
                 }
                 else if (length == 0) {
                     _print_codepoint(codepoint);
                     codepoint = 0;
                 }
                 else {
                     remaining = (unsigned int)length;
                     start     = offset;
                 }
             }
         }

         if (count < 0) {
             perror("read");
             return 1;
         }

         if (remaining != 0) {
             fprintf(stderr, "ERROR at offset 0x%zX: truncated sequence at end of input" "\n", start);
         }

         return 0;
     }
     75