Thiago R. Adams website

Home CodeBlog Articles Downloads Links Books About

Websites

Simple Tokenizer class

//
// Copyright (C) 2009, Thiago Adams (thiago.adams@gmail.com)
// Permission to copy, use, modify, sell and distribute this software
// is granted provided this copyright notice appears in all copies.
// This software is provided "as is" without express or implied
// warranty, and with no claim as to its suitability for any purpose.
//
#include <cwctype>
#include <iostream>
#include <sstream>
#include <string>

// Base class for simple tokenizers that read characters from an input stream.
//
// A derived class drives the scan with the protected helpers: ResetToken()
// starts a new token, Get() consumes one character and appends it to the
// current token text, PutBack() returns the last consumed character to the
// stream, and SetToken() records the kind of the token being built. Clients
// read the result through GetToken() and GetTokenStr().
//
// CharType must be an integral character type, because the token kind is
// stored in a value of that same type (Token is a typedef for Char).
template<class CharType>
class Tokenizer
{
public:
  typedef CharType Char;
  typedef Char Token;
  typedef std::basic_istream<Char> Istream;
  typedef std::basic_string<Char> String;

private:
  Istream & m_istream;  // stream being tokenized; referenced, not owned
  Tokenizer(); // not implemented: a tokenizer always needs a stream
  int m_linepos;        // number of '\n' characters consumed so far
  int m_colpos;         // characters consumed since the last '\n'
  int m_prevcolpos;     // m_colpos just before the last '\n', used by PutBack
  int m_charpos;        // total number of characters consumed so far
  String m_tokenstr;    // text of the token currently being built
  Token m_token;        // kind of the token currently being built
  static const Token TokenEof = 0;

protected:

  // Starts a new token: the kind goes back to TokenEof and the text is emptied.
  void ResetToken()
  {
    m_token = TokenEof;
    m_tokenstr.clear();
  }

  // Binds the tokenizer to the stream i, which must outlive the tokenizer.
  Tokenizer(Istream & i)
    : m_istream(i)
    , m_linepos(0)
    , m_colpos(0)
    , m_prevcolpos(0)
    , m_charpos(0)
    , m_tokenstr()
    , m_token(TokenEof)
  {
  }

  // Returns ch, the character most recently obtained from Get(), to the
  // stream and undoes the bookkeeping Get() did for it.
  void PutBack(Char ch)
  {
    m_istream.putback(ch);

    // Undo the position accounting so the character is not counted twice
    // when it is read again.
    if (m_charpos > 0)
    {
      m_charpos--;

      if (ch == '\n')
      {
        if (m_linepos > 0)
          m_linepos--;
        m_colpos = m_prevcolpos;
      }
      else if (m_colpos > 0)
      {
        m_colpos--;
      }
    }

    // The token text may already be empty (for example after ResetToken());
    // size() - 1 would then wrap around to a huge value.
    if (!m_tokenstr.empty())
      m_tokenstr.resize(m_tokenstr.size() - 1);
  }

  // Consumes one character, appends it to the current token text and
  // returns it as a non-negative int. Returns -1 when no character could be
  // read (end of input or a stream error); in that case the token text and
  // the position counters are left untouched.
  int Get()
  {
    Char ch = 0;

    // Test the read itself: eof() only becomes true after a read has
    // already failed, so checking it beforehand lets one bogus character
    // through at the end of the input.
    if (!m_istream.get(ch))
      return -1;

    m_charpos++;

    if (ch == '\n')
    {
      m_prevcolpos = m_colpos;
      m_linepos++;
      m_colpos = 0;
    }
    else
    {
      m_colpos++;
    }

    m_tokenstr += ch;

    // to_int_type keeps character values from being mistaken for the -1
    // end-of-input marker when Char is a signed type.
    return static_cast<int>(std::char_traits<Char>::to_int_type(ch));
  }

  // Records the kind of the token currently being built.
  void SetToken(Token t)
  {
    m_token = t;
  }

public:

  // Kind of the current token (TokenEof after ResetToken()).
  Token GetToken() const
  {
    return m_token;
  }

  // Text of the current token: the characters consumed since the last
  // ResetToken(), minus any that were put back.
  const String & GetTokenStr() const
  {
    return m_tokenstr;
  }
};


// Example tokenizer over wide-character input. It splits the stream into
// runs of digits (TokenNumber), runs of letters (TokenIdentifier) and runs
// of whitespace (TokenBlank). Any other character becomes a one-character
// token whose kind is left at the value set by ResetToken().
class MyTokenizer : public  Tokenizer<wchar_t>
{
public:
  static const Token TokenNumber = 1;
  static const Token TokenBlank = 2;
  static const Token TokenIdentifier = 3;

private:
  typedef bool (*CharClass)(Char);

  // The wide-character classifiers are used because the <cctype> functions
  // are only defined for values that fit in an unsigned char.
  static bool IsDigit(Char ch)
  {
    return std::iswdigit(static_cast<std::wint_t>(ch)) != 0;
  }

  static bool IsAlpha(Char ch)
  {
    return std::iswalpha(static_cast<std::wint_t>(ch)) != 0;
  }

  static bool IsSpace(Char ch)
  {
    return std::iswspace(static_cast<std::wint_t>(ch)) != 0;
  }

  // Extends the current token with every following character accepted by
  // belongs. The first rejected character is put back so that it starts the
  // next token; running out of input simply ends the run.
  void ConsumeRun(CharClass belongs)
  {
    for (;;)
    {
      const int next = Get();
      if (next == -1)
        return;

      // Keep the full wide character: narrowing it to char would both
      // misclassify it and push a different character back to the stream.
      const Char ch = static_cast<Char>(next);

      if (!belongs(ch))
      {
        PutBack(ch);
        return;
      }
    }
  }

public:

  // Binds the tokenizer to is and immediately reads the first token.
  MyTokenizer(Istream & is) : Tokenizer(is)
  {
    Next();
  }

  // Reads the next token, replacing the current one.
  // Returns false when Get() reports that no character could be read
  // (the current token is then reset to empty), true otherwise.
  bool Next()
  {
    ResetToken();

    const int first = Get();
    if (first == -1)
      return false;

    const Char ch = static_cast<Char>(first);

    if (IsDigit(ch))
    {
      SetToken(TokenNumber);
      ConsumeRun(&IsDigit);
    }
    else if (IsAlpha(ch))
    {
      SetToken(TokenIdentifier);
      ConsumeRun(&IsAlpha);
    }
    else if (IsSpace(ch))
    {
      // A blank run continues over any whitespace, matching the test that
      // starts it (not only over the ' ' character).
      SetToken(TokenBlank);
      ConsumeRun(&IsSpace);
    }

    return true;
  }
};

Sample


using namespace std;

// Returns the fixed-width label printed in front of a token of kind tk.
// Kinds 0 to 3 map to "eof", "number", "blank" and "identifier"; any other
// value yields an empty string.
const wchar_t * TokenStr(Tokenizer<wchar_t>::Token tk)
{
  // Labels indexed by the numeric token kind.
  static const wchar_t * const labels[] =
  {
    L"eof        :",
    L"number     :",
    L"blank      :",
    L"identifier :"
  };
  const unsigned long labelCount = sizeof(labels) / sizeof(labels[0]);

  // Converting to unsigned sends a negative kind out of range as well.
  const unsigned long index = static_cast<unsigned long>(tk);
  if (index < labelCount)
    return labels[index];

  return L"";
}

int main()
{
  std::wstringstream ss;

  ss << L"identifier1    012334identifier2 identifier3";
  MyTokenizer tk(ss);

  do
  {
    wcout << TokenStr(tk.GetToken()) << "'" << tk.GetTokenStr() << "'" << endl;
  }while (tk.Next());

  wcout << TokenStr(tk.GetToken()) << "'" << tk.GetTokenStr() << "'" << endl;

  return 0;
}

Want to see more? Go to the CodeBlog section.

About the author: I am Thiago Adams. I work as a professional C++ software engineer. I have created this website to share ideas and source code with other people with similar interests.
I would like to hear your comments, criticism, questions, and suggestions about this topic or any other part of this website. Email: thiago.adams at gmail dot com