/** *! * \file algo_utf8_unicode.c * \version v0.0.1 * \date 2020/06/25 * \author Bean(notrynohigh@outlook.com) ******************************************************************************* * @attention * * Copyright (c) 2020 Bean * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. ******************************************************************************* */ /*Includes ----------------------------------------------*/ #include "inc/algo_utf8_unicode.h" #include <string.h> #if (defined(_ALGO_UNICODE_ENABLE) && (_ALGO_UNICODE_ENABLE == 1)) /** * \addtogroup ALGORITHM * \{ */ /** * \addtogroup UTF8_UNICODE * \{ */ /** * \defgroup UTF8_UNICODE_Private_TypesDefinitions * \{ */ /** * \} */ /** * \defgroup UTF8_UNICODE_Private_Defines * \{ */ /** * \} */ /** * \defgroup UTF8_UNICODE_Private_Macros * \{ */ /** * \} */ /** * \defgroup UTF8_UNICODE_Private_Variables * \{ */ /** * \} */ /** * \defgroup UTF8_UNICODE_Private_FunctionPrototypes * \{ */ /** * \} */ /** * \defgroup UTF8_UNICODE_Private_Functions * \{ */ /** * \} */ /** * \addtogroup UTF8_UNICODE_Exported_Functions * \{ */ int utf8_to_unicode_size(const uint8_t utf8) { uint8_t c = utf8; // 0xxxxxxx 返回0 // 10xxxxxx 不存在 // 110xxxxx 返回2 // 1110xxxx 返回3 // 11110xxx 返回4 // 111110xx 返回5 // 1111110x 返回6 if (c < 0x80) return 0; if (c >= 0x80 && c < 0xC0) return -1; if (c >= 0xC0 && c < 0xE0) return 2; if (c >= 0xE0 && c < 0xF0) return 3; if (c >= 0xF0 && c < 0xF8) return 4; if (c >= 0xF8 && c < 0xFC) return 5; if (c >= 0xFC) return 6; return -1; } int unicode_to_utf8(unsigned long unic, unsigned char *utf8, int utf8_size) { if (utf8 == NULL || utf8_size < 6) { return -1; } if (unic <= 0x0000007F) { // * U-00000000 - U-0000007F: 0xxxxxxx *utf8 = (unic & 0x7F); return 1; } else if (unic >= 0x00000080 && unic <= 0x000007FF) { // * U-00000080 - U-000007FF: 110xxxxx 10xxxxxx *(utf8 + 1) = (unic & 0x3F) | 0x80; *utf8 = ((unic >> 6) & 0x1F) | 0xC0; return 2; } else if (unic >= 0x00000800 && unic <= 0x0000FFFF) { // * U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx *(utf8 + 2) = (unic & 0x3F) | 0x80; *(utf8 + 1) = ((unic >> 6) & 0x3F) | 0x80; *utf8 = ((unic >> 12) & 0x0F) | 0xE0; return 3; } else if (unic >= 0x00010000 && unic <= 0x001FFFFF) { // * U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx *(utf8 + 3) = (unic & 0x3F) | 0x80; *(utf8 + 2) = ((unic >> 6) & 0x3F) | 0x80; *(utf8 + 1) = ((unic >> 12) & 0x3F) | 0x80; *utf8 = ((unic >> 18) & 0x07) | 0xF0; return 4; } else if (unic >= 0x00200000 && unic <= 0x03FFFFFF) { // * U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx *(utf8 + 4) = (unic & 0x3F) | 0x80; *(utf8 + 3) = ((unic >> 6) & 0x3F) | 0x80; *(utf8 + 2) = ((unic >> 12) & 0x3F) | 0x80; *(utf8 + 1) = ((unic >> 18) & 0x3F) | 0x80; *utf8 = ((unic >> 24) & 0x03) | 0xF8; return 5; } else if (unic >= 0x04000000 && unic <= 0x7FFFFFFF) { // * U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx *(utf8 + 5) = (unic & 0x3F) | 0x80; *(utf8 + 4) = ((unic >> 6) & 0x3F) | 0x80; *(utf8 + 3) = ((unic >> 12) & 0x3F) | 0x80; *(utf8 + 2) = ((unic >> 18) & 0x3F) | 0x80; *(utf8 + 1) = ((unic >> 24) & 0x3F) | 0x80; *utf8 = ((unic >> 30) & 0x01) | 0xFC; return 6; } return 0; } int utf8_to_unicode(const uint8_t *utf8, uint32_t *unicode) { if (utf8 == NULL || unicode == NULL) { return -1; } char b1, b2, b3, b4, b5, b6; *unicode = 0x0; int utfbytes = utf8_to_unicode_size(*utf8); uint8_t *ptmp = (uint8_t *)unicode; switch (utfbytes) { case 0: *ptmp = *utf8; utfbytes += 1; break; case 2: b1 = *utf8; b2 = *(utf8 + 1); if ((b2 & 0xE0) != 0x80) return 0; *ptmp = (b1 << 6) + (b2 & 0x3F); *(ptmp + 1) = (b1 >> 2) & 0x07; break; case 3: b1 = *utf8; b2 = *(utf8 + 1); b3 = *(utf8 + 2); if (((b2 & 0xC0) != 0x80) || ((b3 & 0xC0) != 0x80)) return 0; *ptmp = (b2 << 6) + (b3 & 0x3F); *(ptmp + 1) = (b1 << 4) + ((b2 >> 2) & 0x0F); break; case 4: b1 = *utf8; b2 = *(utf8 + 1); b3 = *(utf8 + 2); b4 = *(utf8 + 3); if (((b2 & 0xC0) != 0x80) || ((b3 & 0xC0) != 0x80) || ((b4 & 0xC0) != 0x80)) return 0; *ptmp = (b3 << 6) + (b4 & 0x3F); *(ptmp + 1) = (b2 << 4) + ((b3 >> 2) & 0x0F); *(ptmp + 2) = ((b1 << 2) & 0x1C) + ((b2 >> 4) & 0x03); break; case 5: b1 = *utf8; b2 = *(utf8 + 1); b3 = *(utf8 + 2); b4 = *(utf8 + 3); b5 = *(utf8 + 4); if (((b2 & 0xC0) != 0x80) || ((b3 & 0xC0) != 0x80) || ((b4 & 0xC0) != 0x80) || ((b5 & 0xC0) != 0x80)) return 0; *ptmp = (b4 << 6) + (b5 & 0x3F); *(ptmp + 1) = (b3 << 4) + ((b4 >> 2) & 0x0F); *(ptmp + 2) = (b2 << 2) + ((b3 >> 4) & 0x03); *(ptmp + 3) = (b1 << 6); break; case 6: b1 = *utf8; b2 = *(utf8 + 1); b3 = *(utf8 + 2); b4 = *(utf8 + 3); b5 = *(utf8 + 4); b6 = *(utf8 + 5); if (((b2 & 0xC0) != 0x80) || ((b3 & 0xC0) != 0x80) || ((b4 & 0xC0) != 0x80) || ((b5 & 0xC0) != 0x80) || ((b6 & 0xC0) != 0x80)) return 0; *ptmp = (b5 << 6) + (b6 & 0x3F); *(ptmp + 1) = (b5 << 4) + ((b6 >> 2) & 0x0F); *(ptmp + 2) = (b3 << 2) + ((b4 >> 4) & 0x03); *(ptmp + 3) = ((b1 << 6) & 0x40) + (b2 & 0x3F); break; default: return 0; } return utfbytes; } /** * \} */ /** * \} */ /** * \} */ #endif /************************ Copyright (c) 2020 Bean *****END OF FILE****/