In this article I described how to retrieve data points and their positions during parsing. However, this method has a small flaw: it doesn't work correctly with Unicode input. We're going to fix this problem now.

Since we rely on the underlying iterator, the position depends on the iterator type used. Commonly, std::string::const_iterator is used, and that does not play well with Unicode input, since in UTF-8 encoding some (or actually many) symbols take more than one character (byte) of the stream.

To solve this problem we need a Unicode-aware iterator, which we get by wrapping our iterator in a boost::u8_to_u32_iterator.

// Iterator adaptor that walks a UTF-8 encoded std::string and yields UTF-32 values.
using iterator_type = boost::u8_to_u32_iterator<std::string::const_iterator>;

There is some more work to do. String attributes now reflect Unicode symbols in the UTF-32 encoding, which doesn't match std::string. So we need to capture the attribute as a std::u32string and convert it back to a std::string using Boost iterators:

std::string to_utf8(const std::u32string& input) {
  return std::string(
      boost::u32_to_u8_iterator<std::u32string::const_iterator>(input.begin()),
      boost::u32_to_u8_iterator<std::u32string::const_iterator>(input.end()));
}
BOOST_PHOENIX_ADAPT_FUNCTION(std::string, to_utf8_, to_utf8, 1)

The macro in the last line allows us to call the function directly in a semantic action; otherwise we would need boost::phoenix::bind for this. As mentioned, we have to capture the output in a std::u32string first, so parser rules may look like this:

  // Collects the input as UTF-32 code points. qi::unicode::char_ requires
  // BOOST_SPIRIT_UNICODE to be defined before the Spirit headers are included.
  // It is used instead of standard_wide::char_ because the latter is bound to
  // wchar_t, which is only 16 bits wide on some platforms and would reject
  // code points above U+FFFF there.
  qi::rule<iterator_type, std::u32string()> string_u32 =
      *(qi::unicode::char_ - qi::eoi);

  // Converts the collected code points back into a UTF-8 std::string.
  qi::rule<iterator_type, std::string()> string =
      string_u32 [qi::_val = to_utf8_(qi::_1)];

We could also wrap these two rules in one larger rule:

  // Single-rule variant: attr_cast forces the inner parser's attribute to be a
  // std::u32string, which the semantic action then re-encodes as UTF-8.
  // qi::unicode::char_ requires BOOST_SPIRIT_UNICODE to be defined before the
  // Spirit headers are included; unlike standard_wide::char_ it does not
  // depend on the platform's wchar_t width.
  qi::rule<iterator_type, std::string()> string =
      boost::spirit::attr_cast<std::u32string, std::u32string>(
        *(qi::unicode::char_ - qi::eoi)) [qi::_val = to_utf8_(qi::_1)];

Complete example

Well, enough said, here is the full example to play with:

#define BOOST_SPIRIT_USE_PHOENIX_V3
#define BOOST_SPIRIT_UNICODE

#include <boost/regex/pending/unicode_iterator.hpp>
#include <boost/fusion/adapted/std_tuple.hpp>
#include <boost/phoenix/function/adapt_function.hpp>

#include <boost/spirit/include/phoenix.hpp>
namespace phx = boost::phoenix;

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/repository/include/qi_iter_pos.hpp>
namespace qi = boost::spirit::qi;

#include <iostream>
#include <string>
#include <tuple>

//==============================================================================
template<typename Iterator>
struct CurrentPos {
  CurrentPos() {
    save_start_pos = qi::omit[boost::spirit::repository::qi::iter_pos[
            phx::bind(&CurrentPos::setStartPos, this, qi::_1)]];
    current_pos = boost::spirit::repository::qi::iter_pos[
            qi::_val = phx::bind(&CurrentPos::getCurrentPos, this, qi::_1)];
  }

  qi::rule<Iterator> save_start_pos;
  qi::rule<Iterator, std::size_t()> current_pos;

private:
  void setStartPos(const Iterator &iterator) {
    start_pos_ = iterator;
  }

  std::size_t getCurrentPos(const Iterator &iterator) {
    return std::distance(start_pos_, iterator);
  }

  Iterator start_pos_;
};

//==============================================================================
std::string to_utf8(const std::u32string& input) {
  return std::string(
      boost::u32_to_u8_iterator<std::u32string::const_iterator>(input.begin()),
      boost::u32_to_u8_iterator<std::u32string::const_iterator>(input.end()));
}
BOOST_PHOENIX_ADAPT_FUNCTION(std::string, to_utf8_, to_utf8, 1)

//==============================================================================
/// Parses a UTF-8 sample string through a UTF-8 -> UTF-32 iterator adaptor and
/// prints the parsed text together with the position reported after it.
int main() {
  std::string input(u8"Hello Unicode world! ♠♣♥♦ äöüß");

  // Decode UTF-8 on the fly, so the parser sees UTF-32 values instead of bytes.
  using iterator_type = boost::u8_to_u32_iterator<std::string::const_iterator>;

  iterator_type first(input.begin()),
                last(input.end());

  // Parsed text (UTF-8) and the position measured after it.
  using ast_t = std::tuple<std::string, std::size_t>;

  CurrentPos<iterator_type> current_pos;

  // qi::unicode::char_ is enabled by BOOST_SPIRIT_UNICODE. It is used instead
  // of standard_wide::char_ because the latter is bound to wchar_t, which is
  // only 16 bits wide on some platforms and would reject code points above
  // U+FFFF there.
  qi::rule<iterator_type, std::u32string()> string_u32 =
      *(qi::unicode::char_ - qi::eoi);

  // Convert the collected code points back into a UTF-8 std::string.
  qi::rule<iterator_type, std::string()> string =
      string_u32 [qi::_val = to_utf8_(qi::_1)];

  qi::rule<iterator_type, ast_t()> rule =
      current_pos.save_start_pos >> string >> current_pos.current_pos;

  ast_t data;
  // Success means the grammar matched AND the whole input was consumed.
  const bool result = qi::parse(first, last, rule, data) && first == last;

  if (result) {
    std::cout << "Parsed: " << std::get<0>(data) << '\n'
              << "Length: " << std::get<1>(data) << '\n';
  } else {
    std::cout << "Failure" << '\n';
  }
}

Output:

Parsed: Hello Unicode world! ♠♣♥♦ äöüß
Length: 30

Without the Unicode adaptation we would get a length of 42, since in UTF-8 äöüß each take two bytes and ♠♣♥♦ each take three.