nix_utf8_encode

Encodes Unicode codepoints into UTF-8
git clone https://0xdd.org/code/nix_utf8_encode.git
Log | Files | Refs | README | LICENSE

main.c (2121B)


      1 /*
      2 2018 David DiPaola
      3 licensed under CC0 (public domain, see https://creativecommons.org/publicdomain/zero/1.0/)
      4 */
      5 
      6 #include <unistd.h>
      7 
      8 #include <stdio.h>
      9 
     10 static unsigned char
     11 _byte_tohex(unsigned char byte) {
     12 	if ((byte >= '0') && (byte <= '9')) {
     13 		return (byte - '0') + 0x0;
     14 	}
     15 	else if ((byte >= 'A') && (byte <= 'F')) {
     16 		return (byte - 'A') + 0xA;
     17 	}
     18 	else if ((byte >= 'a') && (byte <= 'f')) {
     19 		return (byte - 'a') + 0xA;
     20 	}
     21 	else {
     22 		return 0xFF;
     23 	}
     24 }
     25 
     26 static void
     27 _utf8_write(unsigned int codepoint) {
     28 	int status;
     29 	size_t bytes_size;
     30 	if (codepoint <= 0x7F) {
     31 		const char bytes[] = {
     32 			codepoint
     33 		};
     34 		bytes_size = sizeof(bytes);
     35 		status = write(STDOUT_FILENO, bytes, bytes_size);
     36 	}
     37 	else if (codepoint <= 0x7FF) {
     38 		const char bytes[] = {
     39 			((0b110 << 5) | ((codepoint >> (6 * 1)) &  0b11111)),
     40 			((0b10  << 6) | ((codepoint >> (6 * 0)) & 0b111111)),
     41 		};
     42 		bytes_size = sizeof(bytes);
     43 		status = write(STDOUT_FILENO, bytes, bytes_size);
     44 	}
     45 	else if (codepoint <= 0xFFFF) {
     46 		const char bytes[] = {
     47 			((0b1110 << 4) | ((codepoint >> (6 * 2)) &   0b1111)),
     48 			((0b10   << 6) | ((codepoint >> (6 * 1)) & 0b111111)),
     49 			((0b10   << 6) | ((codepoint >> (6 * 0)) & 0b111111)),
     50 		};
     51 		bytes_size = sizeof(bytes);
     52 		status = write(STDOUT_FILENO, bytes, bytes_size);
     53 	}
     54 	else if (codepoint <= 0x10FFFF) {
     55 		const char bytes[] = {
     56 			((0b11110 << 3) | ((codepoint >> (6 * 3)) &    0b111)),
     57 			((0b10    << 6) | ((codepoint >> (6 * 2)) & 0b111111)),
     58 			((0b10    << 6) | ((codepoint >> (6 * 1)) & 0b111111)),
     59 			((0b10    << 6) | ((codepoint >> (6 * 0)) & 0b111111)),
     60 		};
     61 		bytes_size = sizeof(bytes);
     62 		status = write(STDOUT_FILENO, bytes, bytes_size);
     63 	}
     64 	if (status < bytes_size) {
     65 		perror(NULL);
     66 	}
     67 }
     68 
     69 int
     70 main() {
     71 	unsigned char byte;
     72 	int           sawhex    = 0;
     73 	unsigned int  codepoint = 0;
     74 	while (read(STDIN_FILENO, &byte, sizeof(byte)) == sizeof(byte)) {
     75 		unsigned char byte_hex = _byte_tohex(byte);
     76 		if (byte_hex <= 0xF) {
     77 			sawhex = 1;
     78 			codepoint = (codepoint << 4) | byte_hex;
     79 		}
     80 		else if(sawhex) {
     81 			_utf8_write(codepoint);
     82 			codepoint = 0;
     83 			sawhex    = 0;
     84 		}
     85 	}
     86 
     87 	return 0;
     88 }
     89