Thiago R. Adams website

Home CodeBlog Articles Downloads Links Books About

Websites

Home made parser

See also : http://www.thradams.com/codeblog/tkgen.htm

//
// Copyright (C) 2009, Thiago R. Adams
// http://www.thradams.com/
// Permission to copy, use, modify, sell and distribute this software
// is granted provided this copyright notice appears in all copies.
// This software is provided "as is" without express or implied
// warranty, and with no claim as to its suitability for any purpose.
//

#ifndef __PARSER_H__
#define __PARSER_H__

#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

#ifndef ASSERT
#include <cassert>
#define ASSERT assert
#endif

namespace Tra
{
    // Character-class predicates and token recognizers shared by the
    // default token policy.
    //
    // The recognizers take the cursor by reference and advance it past the
    // recognized text. They never test for an end iterator: the input must be
    // terminated by a character that is neither a digit nor a letter (the
    // parser uses a null character for this).
    namespace Detail
    {
        // Returns true when v lies in the closed interval [begin, end].
        template<class T>
        bool CheckRange(const T &v, const T & begin, const T & end)
        {
            return v >= begin && v <= end;
        }

        // Returns true for the decimal digits '0'..'9'.
        template<class Char> bool IsDigit(Char c)
        {
            return CheckRange(c, Char('0'), Char('9'));
        }

        // Returns true for characters that may appear in a symbol:
        // 'A'..'Z', 'a'..'z', '_', and additionally '&' and ';'.
        // NOTE(review): '&' and ';' presumably let entity-like text such as
        // "&amp;" lex as one symbol -- confirm this is intended.
        template<class Char> bool IsLetter(Char c)
        {
            return CheckRange(c, Char('A'), Char('Z')) ||
                   CheckRange(c, Char('a'), Char('z')) ||
                   c == Char('_') ||
                   c == Char('&') ||
                   c == Char(';');
        }

        // Recognizes a symbol: one letter followed by any number of letters
        // or digits (a2, ab, a_2, ...).
        // Returns true and leaves `it` just past the symbol on success;
        // returns false and leaves `it` untouched otherwise.
        template<class Iterator>
        bool IsTokenSymbol(Iterator &it)
        {
            if (!IsLetter(*it))
                return false;

            do
            {
                ++it;
            }
            while (IsLetter(*it) || IsDigit(*it));

            return true;
        }

        // Recognizes a number of the form
        //     -? digits ( '.' digits* )? ( [eE] [+-]? digits )?
        //
        // Returns 0 when no number starts at `it` (cursor untouched),
        //         1 for an integer (no fraction and no exponent),
        //         2 for a double.
        // On success `it` is left just past the number.
        //
        // A '-' not followed by a digit is not a number, and an 'e'/'E' that
        // is not followed by an (optionally signed) digit is left unconsumed,
        // so input such as "1-2" yields the integer "1" only.
        template<class Iterator>
        int IsTokenNumber(Iterator &it)
        {
            // Work on a copy so the caller's cursor only moves on success.
            Iterator cur = it;

            if (*cur == '-')
                ++cur;

            if (!IsDigit(*cur))
                return 0;

            while (IsDigit(*cur))
                ++cur;

            int result = 1; //integer

            if (*cur == '.')
            {
                ++cur;
                while (IsDigit(*cur))
                    ++cur;
                result = 2; //double
            }

            if (*cur == 'e' || *cur == 'E')
            {
                // Look ahead: the exponent only counts if digits follow.
                Iterator exp = cur;
                ++exp;
                if (*exp == '+' || *exp == '-')
                    ++exp;

                if (IsDigit(*exp))
                {
                    while (IsDigit(*exp))
                        ++exp;
                    cur = exp;
                    result = 2; //double
                }
            }

            it = cur;
            return result;
        }

    } // namespace Detail

    //Customizable class
    //
    // Default token policy. It defines the token codes and the FindToken
    // function that classifies the text at the cursor. A token is stored in
    // the character type itself: codes 0..4 are reserved below, and any other
    // single character is returned as its own token.
    template <class CharType>
    struct Tokens
    {
        typedef CharType Token;

        //Token list
        static const Token TokenEof     = 0; //Reserved for eof
        static const Token TokenBlank   = 1; //Reserved for blanks

        //yours
        static const Token TokenSymbol  = 2;
        static const Token TokenInteger = 3;
        static const Token TokenDouble  = 4;

        //Token finder function must be implemented
        //
        // Classifies the token starting at `it` and advances `it` past it.
        // Tried in order: number, symbol, run of blanks (character codes
        // 1..32), otherwise the single character itself.
        //
        // At the null terminator TokenEof is returned and the cursor is not
        // moved, so the terminator is never stepped over.
        template<class Iterator>
        static Token FindToken(Iterator& it)
        {
            if (*it == TokenEof)
                return TokenEof;

            const int r = Detail::IsTokenNumber(it);
            if (r == 1)
                return TokenInteger;
            if (r != 0)
                return TokenDouble;

            if (Detail::IsTokenSymbol(it))
                return TokenSymbol;

            // Blanks are the codes 1..32 only. The explicit "> 0" matters
            // when the character type is signed: negative values (e.g. bytes
            // >= 0x80 in a signed char) are ordinary characters, not blanks.
            if (*it > 0 && *it < 33)
            {
                do
                {
                    ++it;
                }
                while (*it > 0 && *it < 33);

                return TokenBlank;
            }

            // Anything else is a one-character token whose code is the
            // character itself.
            const Token single = *it;
            ++it;
            return single;
        }

        // Returns true when tk is the blank token.
        static bool IsBlank(Token tk)
        {
            return tk == TokenBlank;
        }
    };

    // Tokenizer over a std::basic_istream.
    //
    // The stream is read in chunks into a fixed-size buffer. When a chunk
    // fills the buffer, scanning is limited to the part before the last
    // '\n' found in it; the rest is moved to the front of the buffer on the
    // next refill. The current scan limit is marked by temporarily writing a
    // null character into the buffer (the overwritten character is saved and
    // restored). A line that does not fit in the buffer raises
    // std::overflow_error.
    //
    // Template parameters:
    //   CharType    - character type of the stream.
    //   TokensClass - token policy; must provide TokenEof, FindToken(it)
    //                 and IsBlank(token) (see Tokens).
    //   BufferSize  - default buffer size, in characters.
    template < class CharType,
               class TokensClass = Tokens<CharType>,
               int BufferSize = 4024>
    class BasicParser : public TokensClass
    {
    public:
        typedef CharType Char;
        typedef Char Token;
        typedef std::basic_string<Char> String;
        typedef unsigned int Position;


    private:
        typedef std::vector<Char> Buffer;
        typedef std::basic_istream<Char> Stream;
        typedef typename Buffer::iterator Iterator;
        typedef typename Buffer::const_iterator ConstIterator;
        static const Char NullChar = '\0';

        Stream&  m_Stream;
        Buffer   m_Buffer;
        Iterator m_BufferEndIt;     // last slot of the buffer (kept for the terminator)
        Iterator m_BufferIt;        // one past the last character read from the stream
        Iterator m_SourceIt;        // scan cursor
        Iterator m_SourceLineEndIt; // current scan limit (holds the null marker)
        Iterator m_SourceTokenIt;   // start of the current token
        Char     m_SourceEndChar;   // character overwritten by the null marker

        Token    m_Token;
        size_t   m_CharPos;         // characters consumed by previous buffer refills
        bool     m_SkipWs;
        Position m_LinePos;

        // Walks backwards from pbufferPosition towards pbuffer and returns
        // the first position holding '\n'. Returns pbuffer when none is found
        // (the character at pbuffer itself is not examined).
        template<class iter>
        static iter FindLineStart(iter pbuffer, iter pbufferPosition)
        {
            ASSERT(pbufferPosition >= pbuffer);
            while (pbufferPosition > pbuffer && *pbufferPosition != '\n')
            {
                --pbufferPosition;
            }
            return pbufferPosition;
        }

        // Refills the buffer: moves the unscanned characters to the front,
        // reads more characters from the stream behind them and places the
        // null marker at the new scan limit.
        // Throws std::runtime_error when the stream is bad and
        // std::overflow_error when a full buffer contains no usable '\n'.
        void ReadBuffer()
        {
            // update current Position
            m_CharPos += std::distance(m_Buffer.begin(), m_SourceIt);

            //restore the character hidden by the null marker
            *m_SourceLineEndIt = m_SourceEndChar;

            // there are characters remaining?
            const size_t nRemainingChars = m_BufferIt - m_SourceIt;

            if (nRemainingChars != 0)
            {
                //move the remaining chars to the buffer begin
                //(destination precedes the source, so std::copy is safe)
                std::copy(m_SourceIt, m_BufferIt, m_Buffer.begin());
            }

            // set the BufferPtr to the last remaining char (after copy to begin)
            m_BufferIt = m_Buffer.begin() + nRemainingChars;

            // fill the buffer until the end
            const std::streamsize nchars = std::distance(m_BufferIt, m_BufferEndIt);
            m_Stream.read(&m_BufferIt[0], nchars);

            if (m_Stream.bad())
                throw std::runtime_error("parser: bad stream");

            // set BufferPtr to the last char read
            const std::streamsize nc = m_Stream.gcount();

            m_BufferIt += nc;

            // source pointer returns to the begin
            m_SourceIt = m_Buffer.begin();

            // source end is the last char read
            m_SourceLineEndIt = m_BufferIt;

            if (m_SourceLineEndIt == m_BufferEndIt)
            {
                //buffer is full: limit the scan to the last '\n' so that no
                //line is split between two refills
                m_SourceLineEndIt = FindLineStart(m_Buffer.begin(), m_SourceLineEndIt);

                if (m_SourceLineEndIt == m_Buffer.begin())
                {
                    throw std::overflow_error("parser: line too long");
                }
            }
            //else: last chunk of the stream, scan everything that was read

            ASSERT(m_BufferIt >= m_SourceLineEndIt);

            // save the end point char
            m_SourceEndChar = *m_SourceLineEndIt;

            //marks the end point with null character
            *m_SourceLineEndIt = NullChar;
        }

        // Refills the buffer when the cursor sits on the null marker (first
        // use, or scan limit reached). If the cursor is still on a null
        // character afterwards there is nothing more to read.
        void UpdateBuffer()
        {
            if (*m_SourceIt == NullChar)
            {
                ReadBuffer();
            }
        }

        const BasicParser & operator =(const BasicParser &); //not imp
        BasicParser(const BasicParser &); //not imp

    public:

        // Binds the parser to `stream` and reads the first token.
        //   Skipws      - when true, blank tokens are skipped by NextToken.
        //   buffer_size - buffer size in characters (values below 2 are
        //                 raised to 2).
        // Throws std::runtime_error when the stream is in a failed state.
        BasicParser(Stream &stream,
                    bool Skipws = true,
                    size_t buffer_size = BufferSize)
                :
                // Same order as the member declarations. The fill value is a
                // temporary so that NullChar is not bound to a reference
                // (it has no out-of-class definition).
                m_Stream(stream),
                m_Buffer(buffer_size > 2 ? buffer_size : 2, Char()),
                m_SourceEndChar(Char()),
                m_Token(TokensClass::TokenEof),
                m_CharPos(0),
                m_SkipWs(Skipws),
                m_LinePos(1)
        {
            if (!m_Stream)
            {
                throw std::runtime_error("parser: invalid istream");
            }

            //points always to the buffer's end
            m_BufferEndIt = m_Buffer.end() - 1;

            m_BufferIt = m_Buffer.begin();

            //represents the end point used
            m_SourceLineEndIt = m_BufferIt;

            // cursor
            m_SourceIt = m_BufferIt;

            //points to the Tokens begin
            m_SourceTokenIt = m_BufferIt;

            NextToken();
        }

        // Advances to the next token and returns it. Blank tokens are
        // skipped while Skipws() is true. Returns TokenEof once the input is
        // exhausted.
        //
        // Names inherited from TokensClass are qualified because they live
        // in a dependent base class and are not found by unqualified lookup.
        Token NextToken()
        {
            for (;;)
            {
                UpdateBuffer();
                m_SourceTokenIt = m_SourceIt;

                if (*m_SourceIt == TokensClass::TokenEof)
                {
                    m_Token = TokensClass::TokenEof;//reserved
                    break;
                }

                m_Token = TokensClass::FindToken(m_SourceIt);

                // Count every line break inside the text just consumed, so
                // that blank runs such as " \n" or "\n\n" are accounted for.
                m_LinePos += static_cast<Position>(
                                 std::count(m_SourceTokenIt, m_SourceIt, Char('\n')));

                if (!(Skipws() && TokensClass::IsBlank(m_Token)))
                    break;
            }

            return m_Token;
        }

        // Returns true when the current token is TokenEof.
        bool Eof() const
        {
            return GetToken() == TokensClass::TokenEof;
        }

        // Returns the current token.
        Token GetToken() const
        {
            return m_Token;
        }

        // Returns whether blank tokens are skipped.
        bool Skipws() const
        {
            return m_SkipWs;
        }

        // Sets whether blank tokens are skipped; returns the previous setting.
        bool Skipws(bool sws)
        {
            const bool previous = Skipws();
            m_SkipWs = sws;
            return previous;
        }

        // Returns the offset of the current token from the start of the
        // input, in characters.
        Position GetCharPos() const
        {
            return static_cast<Position>(m_CharPos + (m_SourceTokenIt - m_Buffer.begin()));
        }

        // Returns 1 plus the number of '\n' characters consumed so far,
        // including any contained in the current token.
        Position GetLinePos() const
        {
            return m_LinePos;
        }

        // Returns the distance from the nearest '\n' at or before the
        // current token (or from the buffer start when there is none) to the
        // token start.
        Position GetColPos() const
        {
            const ConstIterator tokenIt = m_SourceTokenIt;
            const ConstIterator lineIt = FindLineStart(m_Buffer.begin(), tokenIt);
            return static_cast<Position>(tokenIt - lineIt);
        }

        // Returns the text of the current token (empty at end of input).
        String GetTokenStr() const
        {
            return String(m_SourceTokenIt, m_SourceIt);
        }
    };

    typedef Tra::BasicParser<char> AParser;
    typedef Tra::BasicParser<wchar_t> WParser;    
    typedef WParser Parser;
    
} //namespace Tra

#endif

Want to see more? Go to the CodeBlog section.

About the author: I am Thiago Adams. I work as a professional C++ software engineer. I have created this website to share ideas and source code with other people with similar interests.
I would like to hear your comments, criticism, questions, and suggestions about this topic or any other part of this website. Email: thiago.adams at gmail dot com