`
dcaoyuan
  • 浏览: 299056 次
社区版块
存档分类
最新评论

A Simple XML State Machine Accepting SAX Events to Build an xmerl-Compatible XML Tree: icalendar demo

阅读更多

xmerl provides full XML functionality in Erlang, with a lot of features like XPATH, XSLT, event_function, acc_function etc. Well, now I just want to parse icalendar into the form of an xmerl tree, which will contain #xmlElement, #xmlAttribute, #xmlText etc., so that XPATH can easily be applied to it.

How about an approach that the parser just generates SAX events, and then, by attaching to a callback state machine to build a JSON or XML tree, or anything else?

I hoped xmerl was something like this, i.e. a parser that generates SAX events, and a state machine that accepts the events and builds the XML tree. I dug into xmerl's code, but, unfortunately, the parser and state machine are coupled together.

So I wrote a simple state machine which just receives SAX events to build an xmerl-compatible XML tree. And I applied it to icalendar.

I like this idea: by using SAX events as the common interface, I only need to write another JSON state machine later, and then the result will be a JSON form of icalendar. I can share the same parser, which just generates SAX events.

Here's the code, which is not complete yet; it is just to show how much a SAX interface can do.

%%% A state machine which receives SAX events and builds an xmerl-compatible tree


-module(xml_sm).

-include_lib("xmerl/include/xmerl.hrl").

-export([state/2]).

-export([test/0
        ]).

%% One entry of the state stack kept by state/2: the element currently being
%% built.
%%   name       - element name; undefined until a startElement sets it
%%   attributes - #xmlAttribute{} records of the element
%%   content    - child nodes collected so far, newest first
%%   parents    - [{Name, Counter}] path, innermost element first
-record(xmlsmState, {
    name,
    attributes = [],
    content = [],
    parents = []
}).

%% Run a whole list of SAX events through state/2, starting without any
%% state stack (the startDocument event creates it).
receive_events(EventList) ->
    receive_events(EventList, undefined).

%% Feed events to state/2 one at a time.
%%
%% Returns {ok, TopElement, RemainingEvents} as soon as state/2 reports a
%% finished tree, {error, Reason} as soon as it reports an error, and
%% {ok, [], []} when the events run out before either happens.
receive_events([], _Stack) ->
    {ok, [], []};
receive_events([Next|Remaining], Stack) ->
    continue_events(state(Next, Stack), Remaining).

%% Decide how to proceed from the value state/2 returned for one event.
continue_events({ok, Root}, Remaining) ->
    {ok, Root, Remaining};
continue_events({error, Why}, _Remaining) ->
    {error, Why};
continue_events(Stack, Remaining) ->
    receive_events(Remaining, Stack).

%% Feed one SAX event into the state machine.
%%
%% Event is one of:
%%   {startDocument}
%%   {endDocument}
%%   {startElement, Uri, LocalName, QName, Attrs}   Attrs = [{Name, Value}]
%%   {endElement, Uri, LocalName, QName}
%%   {characters, Characters}
%%
%% StateStack is the value returned by the previous call (anything for
%% startDocument). While elements are open it is a list of #xmlsmState{},
%% innermost element first, with a document-level state at the bottom.
%%
%% Returns the new state stack, or {ok, #xmlElement{}} once the top-level
%% element has been closed. endDocument passes {ok, Element} through and
%% answers {error, "Bad element match"} for anything else.
%% Throws a flat string when an endElement name does not match the element
%% that is currently open.
state({startDocument}, _StateStack) ->
    [#xmlsmState{}];
state({endDocument}, StateStack) ->
    case StateStack of
        {ok, TopElement} -> {ok, TopElement};
        _ -> {error, "Bad element match"}
    end;
state({startElement, _Uri, LocalName, _QName, Attrs}, StateStack) ->
    %% peek at the enclosing element's state
    [State|_StatesPrev] = StateStack,
    #xmlsmState{parents = Parents} = State,
    %% The fold prepends, so the accumulator ends up in reverse order.
    {_Pos, RevAttributes} = lists:foldl(
        fun ({Name, Value}, {Pos, AccAttrs}) ->
                Pos1 = Pos + 1,
                Attr = #xmlAttribute{name = Name,
                                     value = Value,
                                     parents = [{LocalName, Pos1}|Parents]},
                {Pos1, [Attr|AccAttrs]}
        end, {0, []}, Attrs),
    %% push a fresh state for the new element; restore the attributes to
    %% the order in which they were supplied
    NewState = #xmlsmState{name = LocalName,
                           attributes = lists:reverse(RevAttributes),
                           content = [],
                           parents = [{LocalName, 0}|Parents]},
    [NewState|StateStack];
state({endElement, _Uri, LocalName, _QName}, StateStack) ->
    %% pop the state of the element being closed
    [State|StatesPrev] = StateStack,
    #xmlsmState{name = Name,
                attributes = Attributes,
                content = Content,
                parents = Parents} = State,
    %% An undefined LocalName means the caller does not care about the name.
    case LocalName == undefined orelse LocalName == Name of
        true ->
            ok;
        false ->
            throw(lists:flatten(io_lib:format(
                "Element name match error: ~p should be ~p~n", [LocalName, Name])))
    end,
    %% build the finished element; content was collected newest-first
    [_|ParentsPrev] = Parents,
    Element = #xmlElement{name = Name,
                          attributes = Attributes,
                          content = lists:reverse(Content),
                          parents = ParentsPrev},
    case StatesPrev of
        [_DocumentState] ->
            %% only the document-level state is left: this was the top element
            {ok, Element};
        [ParentState|Other] ->
            %% hand the element to its parent and continue from there
            ParentContent = ParentState#xmlsmState.content,
            [ParentState#xmlsmState{content = [Element|ParentContent]}|Other]
    end;
state({characters, Characters}, StateStack) ->
    [State|StatesPrev] = StateStack,
    #xmlsmState{content = Content,
                parents = Parents} = State,
    %% the head of parents carries the running text counter of this element
    [{Parent, Pos}|ParentsPrev] = Parents,
    Parents1 = [{Parent, Pos + 1}|ParentsPrev],
    Text = #xmlText{value = Characters,
                    parents = Parents1},
    UpdatedState = State#xmlsmState{content = [Text|Content],
                                    parents = Parents1},
    [UpdatedState|StatesPrev].

%% Demo: builds the same small feed document twice from a fixed event list,
%% once through receive_events/1 and once by folding state/2 over the
%% events, and prints both results to the user device.
test() ->
    FeedAttrs = [{link, "http://lightpole.net"}, {author, "Caoyuan"}],
    Events =
        [{startDocument},
         {startElement, [], feed, [], FeedAttrs},
         {characters, "feed text"},
         {startElement, [], entry, [], [{tag, "Erlang, Function"}]},
         {characters, "Entry1's text"},
         {endElement, [], entry, []},
         {startElement, [], entry, [], []},
         {characters, "Entry2's text"},
         {endElement, [], entry, []},
         {endElement, [], feed, []},
         {endDocument}],

    %% Streaming:
    {ok, StreamedXml, _Unconsumed} = receive_events(Events),
    io:fwrite(user, "Streaming Result: ~n~p~n", [StreamedXml]),

    %% Stepped: the state returned for one event is the input for the next
    {ok, SteppedXml} = lists:foldl(fun xml_sm:state/2, undefined, Events),
    Exported = xmerl:export_simple([SteppedXml], xmerl_xml),
    io:fwrite(user, "Stepped Result: ~n~p~n", [lists:flatten(Exported)]).

And the primary icalendar front end:

-module(ical_parser).

-include_lib("xmerl/include/xmerl.hrl").

-export([parse/1
        ]).

-export([test/0
        ]).

%% SAX event sink used by this parser, invoked as ?stateMachine(Event, States).
%% Currently bound to xml_sm:state/2.
-define(stateMachine, fun xml_sm:state/2).

%% Parse iCalendar text by emitting SAX events to ?stateMachine.
%% Sends startDocument, then the events produced by parse_line/3, then
%% endDocument, and returns whatever the state machine answers to
%% endDocument.
parse(Text) ->
    Opened = ?stateMachine({startDocument}, undefined),
    Parsed = parse_line(skip_ws(Text), 0, Opened),
    ?stateMachine({endDocument}, Parsed).

%% Parse the text one content line at a time, sending SAX events to
%% ?stateMachine and threading its state through.
%%   BEGIN:<name>  -> startElement for the component
%%   END:<name>    -> endElement for the component
%%   anything else -> a property: startElement (parameters as attributes),
%%                    characters (the value), endElement
%% Blanks, tabs, CR and LF between lines are skipped; each LF bumps Line.
%% Returns the final state at end of input, or the atom error when BEGIN or
%% END is not followed by a colon.
parse_line([], _Line, States) ->
    States;
parse_line([$\n|Rest], Line, States) ->
    parse_line(Rest, Line + 1, States);
parse_line([C|Rest], Line, States) when C =:= $\s; C =:= $\t; C =:= $\r ->
    parse_line(Rest, Line, States);
parse_line("BEGIN"++Rest, Line, States) ->
    parse_component_line(Rest, Line, States,
                         fun (Name) -> {startElement, [], Name, [], []} end);
parse_line("END"++Rest, Line, States) ->
    parse_component_line(Rest, Line, States,
                         fun (Name) -> {endElement, [], Name, []} end);
parse_line(Text, Line, States) ->
    {Rest, Line1, {Name, Params}, Value} =
        parse_prop(skip_ws(Text), Line, States, {[], []}),
    Events = [{startElement, [], Name, [], Params},
              {characters, Value},
              {endElement, [], Name, []}],
    States1 = lists:foldl(?stateMachine, States, Events),
    parse_line(skip_ws(Rest), Line1, States1).

%% Shared tail of the BEGIN and END clauses: expect a colon, read the
%% component name, send the event built by MakeEvent, then carry on with
%% the following line.
parse_component_line(AfterKeyword, Line, States, MakeEvent) ->
    case skip_ws(AfterKeyword) of
        [$:|AfterColon] ->
            {Rest, Line1, Name} =
                parse_component_name(skip_ws(AfterColon), Line, States, []),
            States1 = ?stateMachine(MakeEvent(Name), States),
            parse_line(skip_ws(Rest), Line1, States1);
        _ ->
            error
    end.

%% Read the component name that follows "BEGIN:" or "END:".
%%
%% Name is the accumulator (characters seen so far, reversed); callers pass
%% []. CR is dropped and blanks/tabs after a name character are skipped. A
%% newline followed by a blank or tab is a folded line and the name
%% continues; any other newline ends the name, and so does the end of the
%% input, so a final line without a trailing newline is accepted.
%%
%% Returns {Rest, Line1, NameAtom}: the remaining text, the updated line
%% counter (one more per newline consumed) and the lower-cased name as an
%% atom.
%%
%% NOTE(security): the name is turned into an atom with list_to_atom/1.
%% Atoms are never garbage collected, so feeding untrusted calendars to
%% this parser can exhaust the atom table.
parse_component_name([], Line, _States, Name) ->
    {[], Line, component_name_atom(Name)};
parse_component_name([$\r|T], Line, States, Name) ->
    parse_component_name(T, Line, States, Name);
parse_component_name([$\n|T], Line, States, Name) ->
    case unfolding_line(T) of
        {true,  Rest} -> parse_component_name(Rest, Line + 1, States, Name);
        {false, Rest} -> {Rest, Line + 1, component_name_atom(Name)}
    end;
parse_component_name([H|T], Line, States, Name) ->
    parse_component_name(skip_ws(T), Line, States, [H|Name]).

%% Turn the reversed name accumulator into a lower-case atom.
component_name_atom(RevName) ->
    list_to_atom(string:to_lower(lists:reverse(RevName))).
    
%% Parse one property line: NAME[;PARAM=VALUE...]:VALUE.
%%
%% The last argument is the accumulator {ReversedName, ReversedParams};
%% callers pass {[], []}. Blanks and tabs after a name character are
%% skipped. A ';' hands over to parse_param/4, a ':' hands over to
%% parse_prop_value/4 and finishes the property.
%%
%% Returns {Rest, Line1, {NameAtom, Params}, Value}, where NameAtom is the
%% lower-cased name and Params is [{ParamAtom, ParamValue}] in input order.
%%
%% NOTE(security): list_to_atom/1 is applied to parsed input; atoms are
%% never garbage collected.
parse_prop([$:|AfterColon], Line, States, {RevName, RevParams}) ->
    NameAtom = list_to_atom(string:to_lower(lists:reverse(RevName))),
    Params = lists:reverse(RevParams),
    {Rest, Line1, Value} = parse_prop_value(AfterColon, Line, States, []),
    {Rest, Line1, {NameAtom, Params}, Value};
parse_prop([$;|AfterSemicolon], Line, States, {RevName, RevParams}) ->
    {Rest, Line1, Key, Val} = parse_param(AfterSemicolon, Line, States, []),
    parse_prop(Rest, Line1, States, {RevName, [{Key, Val}|RevParams]});
parse_prop([Ch|More], Line, States, {RevName, RevParams}) ->
    parse_prop(skip_ws(More), Line, States, {[Ch|RevName], RevParams}).

%% Read a property value up to the end of its (possibly folded) line.
%%
%% Value is the accumulator (characters seen so far, reversed); callers
%% pass []. CR is dropped. A newline followed by a blank or tab is a folded
%% line and the value continues; any other newline ends the value, and so
%% does the end of the input, so a final property without a trailing
%% newline is accepted.
%%
%% Returns {Rest, Line1, Value}: the remaining text, the updated line
%% counter (one more per newline consumed) and the value as a string.
parse_prop_value([], Line, _States, Value) ->
    {[], Line, lists:reverse(Value)};
parse_prop_value([$\r|T], Line, States, Value) ->
    parse_prop_value(T, Line, States, Value);
parse_prop_value([$\n|T], Line, States, Value) ->
    case unfolding_line(T) of
        {true,  Rest} -> parse_prop_value(Rest, Line + 1, States, Value);
        {false, Rest} -> {Rest, Line + 1, lists:reverse(Value)}
    end;
parse_prop_value([H|T], Line, States, Value) ->
    parse_prop_value(T, Line, States, [H|Value]).

%% Parse one property parameter of the form NAME=VALUE.
%%
%% Name is the accumulator for the parameter name (reversed); callers pass
%% []. Blanks and tabs after a name character are skipped. The '=' hands
%% over to parse_param_value/4.
%%
%% Returns {Rest, Line1, NameAtom, Value} with the lower-cased name as an
%% atom.
%%
%% NOTE(security): list_to_atom/1 is applied to parsed input; atoms are
%% never garbage collected.
parse_param([$=|AfterEquals], Line, States, RevName) ->
    Key = list_to_atom(string:to_lower(lists:reverse(RevName))),
    {Rest, Line1, Val} = parse_param_value(AfterEquals, Line, States, []),
    {Rest, Line1, Key, Val};
parse_param([Ch|More], Line, States, RevName) ->
    parse_param(skip_ws(More), Line, States, [Ch|RevName]).

%% Read a parameter value up to, but not including, its terminator.
%%
%% Value is the accumulator (characters seen so far, reversed); callers
%% pass []. The value ends at ';' (another parameter follows) or at ':'
%% (the property value follows). The terminator is left at the head of the
%% returned text because parse_prop/4 dispatches on it: if the ';' were
%% consumed here, the next parameter would be read as part of the property
%% name.
%%
%% Returns {Rest, Line, Value} with Rest starting at the terminator.
parse_param_value([$;|_] = Rest, Line, _States, Value) ->
    {Rest, Line, lists:reverse(Value)};
parse_param_value([$:|_] = Rest, Line, _States, Value) ->
    {Rest, Line, lists:reverse(Value)};
parse_param_value([H|T], Line, States, Value) ->
    parse_param_value(T, Line, States, [H|Value]).


%% Check whether the text right after a newline continues the previous
%% line, i.e. starts with a blank or a horizontal tab.
%% Returns {true, TextAfterThatCharacter} or {false, TextUnchanged}.
unfolding_line([Lead|Continuation]) when Lead =:= $\s; Lead =:= $\t ->
    {true, Continuation};
unfolding_line(NextLine) ->
    {false, NextLine}.
    
%% Drop leading blanks and horizontal tabs; newlines are not skipped.
skip_ws([Lead|Tail]) when Lead =:= $\s; Lead =:= $\t ->
    skip_ws(Tail);
skip_ws(Remaining) ->
    Remaining.


%% Demo: parses a sample calendar and prints both the input text and the
%% exported XML to the user device.
%%
%% The sample is iCalendar text inside an Erlang string literal, so every
%% backslash that belongs to the calendar data is written as "\\". Written
%% with a single backslash, "\," would collapse to "," and "\n" would
%% become a real newline that the parser then treats as a folded line.
test() ->
    Text = "
BEGIN:VCALENDAR
METHOD:PUBLISH
X-WR-CALNAME:Mi's Calendar
VERSION:2.0
PRODID:Spongecell
CALSCALE:GREGORIAN
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061206T120000
DTSTAMP:20070728T004842
LOCATION:Gordon Biersch, 640 Emerson St, Palo Alto, CA
URL:
UID:295803:spongecell.com
SUMMARY:All hands meeting
RRULE:FREQ=WEEKLY;INTERVAL=1
DTEND;TZID=America/Los_Angeles:20061206T130000
DESCRIPTION:
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061207T120000
DTSTAMP:20070728T004842
LOCATION:395 ano nuevo ave\\, sunnyvale\\, ca
URL:
UID:295802:spongecell.com
SUMMARY:Company lunch
RRULE:FREQ=WEEKLY;INTERVAL=1
DTEND;TZID=America/Los_Angeles:20061207T130000
DESCRIPTION:Let's have lots of beer!! (well\\, and some code review :)
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061213T123000
DTSTAMP:20070728T004842
LOCATION:369 S California Ave\\, Palo Alto\\, CA
URL:
UID:295714:spongecell.com
SUMMARY:Ben is back.. want to meet again
DTEND;TZID=America/Los_Angeles:20061213T133000
DESCRIPTION:Re: Ben is back.. want to meet again\\n Marc
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20070110T200000
DTSTAMP:20070728T004842
LOCATION:
URL:
UID:304529:spongecell.com
SUMMARY:flight back home
DTEND;TZID=America/Los_Angeles:20070110T210000
DESCRIPTION:
END:VEVENT
BEGIN:VTIMEZONE
TZID:America/Los_Angeles
BEGIN:STANDARD
DTSTART:20071104T000000
TZNAME:PST
RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU
TZOFFSETFROM:-0700
TZOFFSETTO:-0800
END:STANDARD
BEGIN:DAYLIGHT
DTSTART:20070311T000000
TZNAME:PDT
RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=1SU
TZOFFSETFROM:-0800
TZOFFSETTO:-0700
END:DAYLIGHT
END:VTIMEZONE
END:VCALENDAR


",
    io:fwrite(user, "Text: ~s~n", [Text]),
    {ok, Xml} = parse(Text),
    XmlText = lists:flatten(xmerl:export_simple([Xml], xmerl_xml)),
    io:fwrite(user, "Parsed: ~n~p~n", [XmlText]).

You may have noticed that ?stateMachine can be pointed to a json_machine:state/2 some day, and we can get a JSON result without modifying ical_parser.erl.

This can also be applied to JSON<->XML transforms. Actually, I think SAX events are a good interface for transforming data objects between various formats. It's also a bit Erlang style (event passing). The parser and the state machine can communicate via SAX events as two separate processes, living with send/receive.

分享到:
评论

相关推荐

    Sublime.Text.Build.3078._Win_32bit破解主文件

    Build Systems: Renamed "keyfile" to "keyfiles", now accepting a list of files that can trigger the build system (e.g., ["Makefile", "makefile"]) Improved change detection for files that disappear and ...

    Beginning Python (2005).pdf

    Deciding When to Use DBM and When to Use a Relational Database 255 Working with Relational Databases 255 Writing SQL Statements 257 Defining Tables 259 Setting Up a Database 260 Try It Out: ...

    Python Programming 8 Simple Steps to Learn Python in 24 hours

    Python Programming: 8 Simple Steps to Learn Python Programming Language in 24 hours! Practical Python Programming for Beginners, Python Commands and Python Language (Python Programming Crush Course) ...

    Asking the Right Questions: A Guide to Critical Thinking (8th Edition)

    It teaches them to respond to alternative points of view and develop a solid foundation for making personal choices about what to accept and what to reject as they read and listen. Chapter titles ...

    Using LUA with Visual C++ (Introduction)

    The first one is the pointer to the LUA state, the second one is a pointer to a user-defined reader function, the third pointer is a user-defined value that the reader function will receive, and the ...

    Solaris 10 System Administration Essentials

    8.5.4 How to Identify a Defective Sector by Performing a Surface Analysis 221 8.5.5 How to Repair a Defective Sector 222 8.5.6 How to Display the Partition Table or Slice Information 223 8.5.7 ...

    Google C++ Style Guide(Google C++编程规范)高清PDF

    To guarantee uniqueness, they should be based on the full path in a project's source tree. For example, the file foo/src/bar/baz.h in project foo should have the following guard: #ifndef FOO_BAR_BAZ...

    Take.My.Money.Accepting.Payments.on.the.Web.1680501992

    By the end, you will know how to create a fully-functional web payment-taking machine. What You Need: The code in this book works with Ruby 2.3.1 and Rails 5, though nearly all of the code will run ...

    sed-awk-2nd-edition.chm

    The book begins with an overview and a tutorial that demonstrate a progression in functionality from grep to sed to awk. sed and awk share a similar command-line syntax, accepting user instructions in...

    Professional Python Frameworks - Web 2.0 Programming with Django and TurboGears

    An Example: A Simple Blog Site 104 Creating the Project 105 Defining the Model 105 Using TGCrud 108 Creating the Blog Author Views with TGCrud 110 Creating the Blog Entry Views with TGCrud 118 ...

    Entity Framework Code First Succinctly(Syncfusion,2014)

     Simple to use: there are no XML schemas to master, no base classes to inherit from, no arcane interfaces to implement, and it has a clean, tidy API. You just focus on the actual domain model and ...

    Accepting-Payment:接受付款

    Accepting-Payment:接受付款

    Manning.Spring.in.Action.4th.Edition.2014.11.epub

    17.1. A brief introduction to asynchronous messaging 17.1.1. Sending messages 17.1.2. Assessing the benefits of asynchronous messaging 17.2. Sending messages with JMS 17.2.1. Setting up a message ...

    Mastering Linux Shell Scripting 2nd Edition

    Then, you'll learn how to write a simple bash script and how to edit your bash script using Linux editors. Following this, you will learn how to define a variable and the visibility of a variable. ...

    rfc全部文档离线下载rfc1-rfc8505

    possible to query an IMP about the state of a link (although it might be possible to query an IMP about the recent history of a link -- quite a different matter!). The other primitive ...

    visual assist 10.5.1707

    The text caret is placed in the correct location after accepting a .NET Generic from a suggestion list in VB 2008. (case=20259) 8257 When typing a parameter in a C# LINQ predicate function, focus is ...

    IOS5 Programming Cookbook

    1.1 Creating a Simple iOS App in Xcode 2 1.2 Understanding Interface Builder 7 1.3 Compiling iOS Apps 13 1.4 Running iOS Apps on the Simulator 15 1.5 Running iOS Apps on iOS Devices 17 1.6 Packaging ...

    visual assist 1707破解

    * The text caret is placed in the correct location after accepting a .NET Generic from a suggestion list in VB 2008. (case=20259) 8257 * When typing a parameter in a C# LINQ predicate function, ...

    Bad Programming Practices 101-Apress(2018).pdf

    So, you’re a programmer, or at least a programmer-in-training. You want to improve your programming skills. You want to become more productive as soon as possible. You’ll be working with colleagues ...

Global site tag (gtag.js) - Google Analytics