Examples

convert

This example shows the possible ways to convert between UTF encodings, by converting a UTF-x range into UTF-8 and printing it.

#include <boost/unicode/utf.hpp>
#include <boost/range/as_literal.hpp>
#include <boost/foreach.hpp>
#include <vector>
#include <string>
#include <iostream>
#include <iterator>

namespace unicode = boost::unicode;

int main()
{
    // we use boost::as_literal in order not to have the trailing null character
    boost::iterator_range<const wchar_t*> foo = boost::as_literal(L"hello \u00E9 world");
    std::vector<boost::char32> bar;
    std::string baz;

Decoding and encoding eager, using two buffers:

unicode::utf_decode(foo, std::back_inserter(bar));
unicode::u8_encode(bar, std::back_inserter(baz));
std::cout << baz << std::endl;

bar.clear();
baz.clear();

Decoding and encoding eager, using one buffer and a stream iterator:

unicode::utf_decode(foo, std::back_inserter(bar));
unicode::u8_encode(bar, std::ostream_iterator<char>(std::cout));
std::cout << std::endl;

Decoding eager, stream iterator lazily encoded:

unicode::utf_decode(foo,
    unicode::adaptors::u8_encode_output(
        std::ostream_iterator<char>(std::cout)
    )
);
std::cout << std::endl;

Decoding is lazy, encoding is eager, using one buffer:

unicode::u8_encode(unicode::adaptors::utf_decode(foo), std::back_inserter(baz));
std::cout << baz << std::endl;

Decoding is lazy, encoding is eager, using stream iterators:

unicode::u8_encode(unicode::adaptors::utf_decode(foo), std::ostream_iterator<char>(std::cout));
std::cout << std::endl;

Fully lazy (recommended unless you need to store the result in a specific container):

   std::cout << unicode::adaptors::u8_encode(unicode::adaptors::utf_decode(foo)) << std::endl;
    
}

characters

This example shows the difference between code units, code points and graphemes, the different levels at which to approach the concept of character.

#include <boost/range/algorithm.hpp>
#include <boost/range/as_literal.hpp>
#include <boost/foreach.hpp>

#include <boost/unicode/utf.hpp>
#include <boost/unicode/graphemes.hpp>

#include <iostream>

namespace unicode = boost::unicode;

int main()
{
    // UTF-8 bytes: "foo", CR LF, U+0113 followed by U+0301, a space, then U+1E17.
    char foo_[] = "foo\r\n\xc4\x93\xcc\x81 \xe1\xb8\x97";

    // Go through as_literal rather than using foo_ itself, so that the
    // terminating 0 is not part of the range.
    boost::iterator_range<char*> foo = boost::as_literal(foo_);

    // The same text, counted at three different levels.
    std::cout << "Code units: " << boost::distance(foo) << std::endl;
    std::cout << "Code points: "
              << boost::distance(unicode::adaptors::u8_segment(foo))
              << std::endl;
    std::cout << "Graphemes: "
              << boost::distance(unicode::adaptors::u8_grapheme_segment(foo))
              << std::endl;

    // Print each code point as its own parenthesised sub-range of foo.
    std::cout << std::endl << "Code points boundaries:" << std::endl;
    BOOST_FOREACH(boost::iterator_range<char*> cp, unicode::adaptors::u8_segment(foo))
    {
        std::cout << '(' << cp << ')';
    }
    std::cout << std::endl;

    // Same again, this time one parenthesised sub-range per grapheme.
    std::cout << std::endl << "Graphemes boundaries:" << std::endl;
    BOOST_FOREACH(boost::iterator_range<char*> cluster, unicode::adaptors::u8_grapheme_segment(foo))
    {
        std::cout << '(' << cluster << ')';
    }
    std::cout << std::endl;
}

	We could also have used `unicode::adaptors::u8_decode(foo)` here.
	We could also have used `unicode::adaptors::grapheme_segment(unicode::adaptors::u8_decode(foo))` here.
	Take into account that your terminal may interpret the carriage return when displaying the output, which can produce visually strange results.
	In case the type of `cp` is too difficult to deduce, you could use `BOOST_FOREACH_AUTO` if your platform supports it or type erasure with `boost::lazy_range<char>` if you don't mind the overhead.

compose

This example shows how to decompose, recompose, normalize strings and how to maintain a normalized form while concatenating two strings.

#include <boost/unicode/compose.hpp>
#include <boost/unicode/cat.hpp>

#include <boost/range/algorithm/copy.hpp>
#include <boost/assign/list_of.hpp>

#include <climits>
#include <iostream>
#include <iterator>

namespace unicode = boost::unicode;
namespace ucd = unicode::ucd;
using boost::assign::list_of; 
using boost::char32;

We're going to do a lot of copying to std::cout in this example, with spaces delimiting the elements, so we instantiate the iterator once:

// Shared output iterator: writes each char32 to std::cout followed by a single space.
static std::ostream_iterator<char32> output(std::cout, " ");

// Walks through decomposition, composition, normalization and
// normalization-preserving concatenation, printing every result to std::cout
// as space-separated hexadecimal code points.
int main()
{
    // U+1E17, the precomposed code point used throughout this example.
    char32 cp = 0x1E17;
    
    // We want all results in hexadecimal
    std::cout << std::hex;
    
    // Raw decomposition data stored in the Unicode Character Database for cp.
    std::cout << "Decomposition of U+01E17 within the UCD: ";
    boost::copy(ucd::get_decomposition(cp), output);
    std::cout << std::endl;
    std::cout << "Decomposition type: " << as_string(ucd::get_decomposition_type(cp)) << std::endl;
    
    // Decomposition of a one-element range holding cp, with default options.
    std::cout << "Canonical decomposition: ";
    unicode::decompose(list_of(cp), output);
    std::cout << std::endl;
    
    // U+00A8 decomposed twice: once with default options, once with UINT_MAX
    // as the third argument.
    std::cout << "Canonical decomposition of U+00A8: ";
    unicode::decompose(list_of(0xA8), output);
    std::cout << std::endl;
    std::cout << "Compatibility decomposition of U+00A8: ";
    // NOTE(review): UINT_MAX presumably acts as a mask that enables every
    // decomposition type, compatibility ones included; confirm against the
    // unicode::decompose reference. UINT_MAX itself comes from <climits>.
    unicode::decompose(list_of(0xA8), output, UINT_MAX);
    std::cout << std::endl;
    std::cout << std::endl;
    
    // Composition input: U+0113 followed by U+0301.
    char32 foo[] = { 0x113, 0x301 };
    std::cout << "Canonical composition of { ";
    boost::copy(foo, output);
    std::cout << "}: ";
    unicode::compose(foo, output);
    std::cout << std::endl;
    
    // Composition input: a base letter (U+0065) followed by two combining
    // marks (U+0304, U+0301).
    char32 foo2[] = { 0x65, 0x304, 0x301 };
    std::cout << "Canonical composition of { ";
    boost::copy(foo2, output);
    std::cout << "}: ";
    unicode::compose(foo2, output);
    std::cout << std::endl;
    
    // Composition input: U+0020 followed by U+0308.
    char32 bar[] = { 0x20, 0x308 };
    std::cout << "Canonical composition of { ";
    boost::copy(bar, output);
    std::cout << "}: ";
    unicode::compose(bar, output);
    std::cout << std::endl << std::endl;
    
    // Normalization input: cp followed by the combining mark U+0330.
    char32 baz[] = { cp, 0x330 };
    std::cout << "Normalization C of { ";
    boost::copy(baz, output);
    std::cout << "}: ";
    unicode::normalize(baz, output);
    std::cout << std::endl;
    
    // Concatenation of decomposed strings: the first ends with combining
    // marks and the second begins with one, so the marks from both strings
    // meet at the junction.
    char32 cat_dec1[] = { 0x48, 0x65, 0x304, 0x301 };
    char32 cat_dec2[] = { 0x330, 0x49 };
    std::cout << "Concatenation of the two decomposed strings { ";
    boost::copy(cat_dec1, output);
    std::cout << "} and { ";
    boost::copy(cat_dec2, output);
    std::cout << "}: ";
    unicode::decomposed_concat(cat_dec1, cat_dec2, output);
    std::cout << std::endl;
    
    // Same concatenation, but the first string now holds the precomposed
    // U+1E17 instead of its three-code-point decomposed form.
    char32 cat_comp1[] = { 0x48, 0x1e17 };
    char32 cat_comp2[] = { 0x330, 0x49 };
    std::cout << "Concatenation of the two composed strings { ";
    boost::copy(cat_comp1, output);
    std::cout << "} and { ";
    boost::copy(cat_comp2, output);
    std::cout << "}: ";
    unicode::composed_concat(cat_comp1, cat_comp2, output);
    std::cout << std::endl;
}

search

This example shows how to search a substring within a string, at the grapheme level, using two methods.

In this example we're going to use BOOST_AUTO, because the return type of some of the functions is unspecified. Ideally, though, you should try to avoid that dependency by not naming the variables at all, or rely on a type deduction system the library doesn't provide yet.

#include <boost/algorithm/string.hpp>
#include <boost/unicode/search.hpp>

#include <boost/typeof/typeof.hpp>
#include <iostream>

#include <boost/unicode/graphemes.hpp>
#include <boost/unicode/utf.hpp>

namespace unicode = boost::unicode;

int main()
{

We define the string we're going to search in, "foo<combining circumflex accent>foo", as well as its version in terms of graphemes:

char foo[] = "foo\xcc\x82" "foo";
BOOST_AUTO(foo_bounded, unicode::adaptors::utf_grapheme_segment(boost::as_literal(foo)));

We do the same thing for the string we're going to look for, "foo"

char search[] = "foo";
BOOST_AUTO(search_bounded, unicode::adaptors::utf_grapheme_segment(boost::as_literal(search)));

We perform the search using the ranges of graphemes, i.e. the Segmenter-based approach:

BOOST_AUTO(range_segmenter, boost::algorithm::find_first(foo_bounded, search_bounded));

We perform the search using the original range, but using an adapted Boost.StringAlgo Finder with the relevant BoundaryChecker:

BOOST_AUTO(finder,
    boost::algorithm::make_boundary_finder(
        boost::algorithm::first_finder(search),
        unicode::utf_grapheme_boundary()
    )
);
boost::iterator_range<char*> range_boundary = boost::algorithm::find(foo, finder);

We now display the resulting matches, which should both be pointing to the second occurrence, with their positions within the original UTF-8 string:

    std::cout << "[" << std::distance(boost::begin(foo), range_segmenter.begin().base()) << ", " << std::distance(boost::begin(foo), range_segmenter.end().base()) << "] ";
    std::cout << range_segmenter << std::endl;
    
    std::cout << "[" << std::distance(boost::begin(foo), range_boundary.begin()) << ", " << std::distance(boost::begin(foo), range_boundary.end()) << "] ";
    std::cout << range_boundary << std::endl;
}

source_input

This example shows how to input some non-ASCII Unicode characters into source files by different means, following the legacy guidelines, and then how to convert them to displayable UTF-8.

The following strings are considered:

$ U+00A2 U+20AC U+024B62
hello U+00E9 world

#include <boost/unicode/string_cp.hpp>

#include <boost/unicode/utf.hpp>
#include <boost/unicode/static_utf.hpp>
#include <iostream>

namespace mpl = boost::mpl;
namespace unicode = boost::unicode;

Direct UTF-8:

char direct_utf8_1[] = "$¢€𤭢";
char direct_utf8_2[] = "hello é world";

UTF-8 as escape sequences:

char escape_utf8_1[] = "$\xC2\xA2\xE2\x82\xAC\xF0\xA4\xAD\xA2";
char escape_utf8_2[] = "hello \xC3\xA9 world";

Probably UTF-X (either 16 or 32) as Unicode escape sequences:

wchar_t utfx_1[] = L"$\u00A2\u20AC\U00024B62";
wchar_t utfx_2[] = L"hello \u00E9 world";

Compile-time direct UTF-8:

typedef mpl::string<'$', '¢', '€', '𤭢'> static_direct_utf8_1;
typedef mpl::string<'hell', 'o ', 'é', ' wor', 'ld'> static_direct_utf8_2;

Compile-time UTF-8 with Unicode code points:

typedef mpl::string<'$', unicode::string_cp<0xA2>::value, unicode::string_cp<0x20AC>::value, unicode::string_cp<0x24B62>::value> static_escape_utf8_1;
typedef mpl::string<'hell', 'o ', unicode::string_cp<0xE9>::value, ' wor', 'ld'> static_escape_utf8_2;

Compile-time UTF-16:

typedef mpl::u16string<'$', 0xA2, 0x20AC, 0xD852, 0xDF62> static_utf16_1;
typedef mpl::u16string<'h', 'e', 'l', 'l', 'o', ' ', 0xE9, ' ', 'w', 'o', 'r', 'l', 'd'> static_utf16_2;

Compile-time UTF-32:

// The two sample strings given directly as UTF-32 code points.
// The last element of static_utf32_1 is U+24B62, the same code point that the
// UTF-16 variant spells as the surrogate pair 0xD852 0xDF62.
typedef mpl::u32string<'$', 0xA2, 0x20AC, 0x24B62> static_utf32_1;
typedef mpl::u32string<'h', 'e', 'l', 'l', 'o', ' ', 0xE9, ' ', 'w', 'o', 'r', 'l', 'd'> static_utf32_2;

Let's display everything as UTF-8:

int main()
{

For strings that are already in UTF-8, nothing to do:

std::cout << direct_utf8_1 << std::endl;
std::cout << direct_utf8_2 << std::endl;
std::cout << escape_utf8_1 << std::endl;
std::cout << escape_utf8_2 << std::endl;

Wide strings are either UTF-16 or UTF-32; we convert them to UTF-32, then back to UTF-8:

std::cout << unicode::adaptors::u8_encode(unicode::adaptors::utf_decode(utfx_1)) << std::endl;
std::cout << unicode::adaptors::u8_encode(unicode::adaptors::utf_decode(utfx_2)) << std::endl;

Compile-time UTF-8 strings just need to be converted to runtime ones:

std::cout << mpl::c_str<static_direct_utf8_1>::value << std::endl;
std::cout << mpl::c_str<static_direct_utf8_2>::value << std::endl;
std::cout << mpl::c_str<static_escape_utf8_1>::value << std::endl;
std::cout << mpl::c_str<static_escape_utf8_2>::value << std::endl;

Compile-time decoding of UTF-16 is unsupported at the moment, so we cannot convert static_utf16_1 and static_utf16_2 to displayable UTF-8.

Compile-time UTF-32 can be converted at compile-time to UTF-8 then converted to runtime strings:

    std::cout << mpl::c_str<unicode::static_u8_encode<static_utf32_1>::type>::value << std::endl;
    std::cout << mpl::c_str<unicode::static_u8_encode<static_utf32_2>::type>::value << std::endl;
}