nstool/lib/fnd/string_conv.cpp

#include "string_conv.h"
#include <vector>
#include <string>

using namespace fnd;

std::u16string StringConv::ConvertChar8ToChar16(const std::string & in)
{
	std::u32string unicode;
	size_t done = 0;
	for (size_t i = 0; i < in.length(); i += done)
	{
		// get number of leading high bits in first byte
		uint8_t prefix = get_utf8_prefix(in[i]);
		if (prefix == 1 || prefix > 4) // 1 is reserved for trailer bytes
		{
			throw std::logic_error("not a UTF-8 string");
		}

		// if there are no prefix bits, this is ASCII
		if (prefix == 0)
		{
			unicode.push_back(in[i]);
			done = 1;
		}
		// otherwise this is a multibyte character
		else
		{
			// there must be enough characters
			if ((i + prefix) > in.length())
			{
				throw std::logic_error("not a UTF-8 string");
			}

			char32_t uni = get_utf8_data(prefix, in[i]);

			for (uint8_t j = 1; j < prefix; j++)
			{
				if (utf8_has_prefix(1, in[i + j]) == false)
				{
					throw std::logic_error("not a UTF-8 string");
				}

				uni <= 6;
				uni |= get_utf8_data(1, in[i + j]);
			}

			if (uni >= kUtf16HighSurrogateStart && uni <= kUtf16LowSurrogateEnd)
			{
				throw std::logic_error("not a UTF-8 string");
			}

			if (uni > kUtf16EncodeMax)
			{
				throw std::logic_error("not a UTF-8 string");
			}

			unicode.push_back(uni);
			done = prefix;
		}

	}

	std::u16string utf16;
	for (size_t i = 0; i < unicode.size(); i++)
	{
		char32_t uni = unicode[i];
		if (uni < kUtf16NonNativeStart)
		{
			utf16.push_back(uni);
		}
		else
		{
			uni -= kUtf16NonNativeStart;
			utf16.push_back(((uni >> kUtf16SurrogateBits) & kUtf16SurrogateMask) + kUtf16HighSurrogateStart);
			utf16.push_back((uni & kUtf16SurrogateMask) + kUtf16LowSurrogateStart);
		}
	}
	return utf16;
}

std::string StringConv::ConvertChar16ToChar8(const std::u16string & in)
{
	std::u32string unicode;
	size_t done = 0;
	for (size_t i = 0; i < in.length(); i+=done)
	{
		// this isn't a utf16 reserved character, so just add to unicode string
		if (in[i] < kUtf16HighSurrogateStart || in[i] > kUtf16LowSurrogateEnd)
		{
			unicode.push_back(in[i]);
			done = 1;
		}
		// otherwise we need to decode it
		else
		{
			// check that the high surrogate char exists first
			if (in[i] < kUtf16HighSurrogateStart || in[i] > kUtf16HighSurrogateEnd)
			{
				throw std::logic_error("not a UTF-16 string");
			}
			// check that the low surrogate char exists next
			if (i >= in.length() - 1 || in[i + 1] < kUtf16LowSurrogateStart || in[i + 1] > kUtf16LowSurrogateEnd)
			{
				throw std::logic_error("not a UTF-16 string");
			}

			char32_t uni = ((in[i] & kUtf16SurrogateMask) << kUtf16SurrogateBits) | (in[i + 1] & kUtf16SurrogateMask) | 0x10000;

			unicode.push_back(uni);
			done = 2;
		}
	}

	std::string utf8;
	for (size_t i = 0; i < unicode.length(); i++)
	{
		if (unicode[i] <= kUtf8AsciiEnd)
		{
			utf8.push_back(unicode[i]);
		}
		else if (unicode[i] <= kUtf82ByteEnd)
		{
			utf8.push_back(make_utf8(2, (unicode[i] >> 6)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 0)));
		}
		else if (unicode[i] <= kUtf83ByteEnd)
		{
			utf8.push_back(make_utf8(3, (unicode[i] >> 12)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 6)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 0)));
		}
		else if (unicode[i] <= kUtf84ByteEnd)
		{
			utf8.push_back(make_utf8(4, (unicode[i] >> 18)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 12)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 6)));
			utf8.push_back(make_utf8(1, (unicode[i] >> 0)));
		}
		else
		{
			throw std::logic_error("not a UTF-16 string");
		}
	}

	return utf8;
}