- 浏览: 299496 次
文章分类
最新评论
-
shoru:
最新的netbeans scala插件无法安装到6.9中呀,求 ...
Scala-对Java的修正和超越-Presentation -
alanwu:
night_stalker 写道讲一次多少钱啊,能透露下么…… ...
Scala-对Java的修正和超越-Presentation -
wzjin:
暂时没有考虑学习,等有一定市场再看!
Scala-对Java的修正和超越-Presentation -
dcaoyuan:
jasspier 写道对scala有点兴趣,有没有入门的推荐教 ...
Scala-对Java的修正和超越-Presentation -
jasspier:
草原兄,听了你的课,对scala有点兴趣,有没有入门的推荐教程 ...
Scala-对Java的修正和超越-Presentation
Playing with Tim's Erlang Exercise is so much fun.
I've been coding in Erlang about 6 months as a newbie, in most cases, I do parsing on string (or list what ever) with no need of regular expressions, since Erlang's pattern match can usaully solve most problems straightforward.
Tim's log file is also a good example for applying pattern match in Erlang way. It's a continuous stream of dataset, after splitting it to line-bounded chunks for parallellization purpose, we can truely match whole {GET /ongoing/When/\d\d\dx/(\d\d\d\d/\d\d/\d\d/[^ .]+) } directly on chunk with no need to split to lines any more.
This come out my third solution, which matchs whole
{GET /ongoing/When/\d\d\dx/(\d\d\d\d/\d\d/\d\d/[^ .]+) }
likeness using the pattern:
"GET /ongoing/When/"++[_,_,_,$x,$/,Y1,Y2,Y3,Y4,$/,M1,M2,$/,D1,D2,$/|Rest]
and then fetchs
[Y1,Y2,Y3,Y4,$/,M1,M2,$/,D1,D2,$/] ++ match_until_space_newline(Rest, [])
as the matched key, with no need to split the chunk to lines.
But yes, we still need to split each chunk on the lastest newline to get parallelized result exactly accurate.
On my 2-core 2 GHz MacBook, the best time I’ve got is 4.483 sec
# smp enabled: $ erlc -smp tbray3.erl $ time erl -smp +P 60000 -noshell -run tbray3 start o1000k.ap -s erlang halt 8900 : <<"2006/09/29/Dynamic-IDE">> 2000 : <<"2006/07/28/Open-Data">> 1300 : <<"2003/07/25/NotGaming">> 800 : <<"2003/10/16/Debbie">> 800 : <<"2003/09/18/NXML">> 800 : <<"2006/01/31/Data-Protection">> 700 : <<"2003/06/23/SamsPie">> 600 : <<"2006/09/11/Making-Markup">> 600 : <<"2003/02/04/Construction">> 600 : <<"2005/11/03/Cars-and-Office-Suites">> Time: 4142.83 ms real 0m4.483s user 0m5.804s sys 0m0.615s # no-smp: $ erlc tbray3.erl $ time erl -noshell -run tbray_list_no_line start o1000k.ap -s erlang halt real 0m7.050s user 0m6.183s sys 0m0.644s
The smp enable result speedup about 57%
On the 2.80GHz 4-cpu xeon debian box that I mentioned before in previous blog, the best result is:
real 0m8.420s user 0m11.637s sys 0m0.452s
And I've noticed, adjusting the BUFFER_SIZE can balance the time consumered by parallelized parts and un-parallelized parts. That is, if the number of core is increased, we can also increase the BUFFER_SIZE a bit, so the number of chunks decreased (less un-parallelized split_on_last_new_line/1 and file:pread/3) but with more heavy work for parallelized binary_to_list/1 and scan_chunk/1 on longer list.
The best BUFFER_SIZE on my computer is 4096 * 5 bytes, which causes un-parallized split_on_last_newline/1 took about only 0.226s in the case.
The code:
-module(tbray3). -compile([native]). -export([start/1]). %% The best Bin Buffer Size is 4096 * 1 - 4096 * 5 -define(BUFFER_SIZE, (4096 * 5)). start(FileName) -> Start = now(), Main = self(), Collector = spawn(fun () -> collect_loop(Main) end), {ok, File} = file:open(FileName, [raw, binary]), read_file(File, Collector), %% don't terminate, wait here, until all tasks done. receive stop -> io:format("Time: ~10.2f ms~n", [timer:now_diff(now(), Start) / 1000]) end. read_file(File, Collector) -> read_file_1(File, [], 0, Collector). read_file_1(File, PrevTail, I, Collector) -> case file:read(File, ?BUFFER_SIZE) of eof -> Collector ! {chunk_num, I}, file:close(File); {ok, Bin} -> {Chunk, NextTail} = split_on_last_newline(PrevTail ++ binary_to_list(Bin)), spawn(fun () -> Collector ! {dict, scan_chunk(Chunk)} end), read_file_1(File, NextTail, I + 1, Collector) end. split_on_last_newline(List) -> split_on_last_newline_1(lists:reverse(List), []). split_on_last_newline_1(List, Tail) -> case List of [] -> {lists:reverse(List), []}; [$\n|Rest] -> {lists:reverse(Rest), Tail}; [C|Rest] -> split_on_last_newline_1(Rest, [C | Tail]) end. collect_loop(Main) -> collect_loop_1(Main, dict:new(), undefined, 0). collect_loop_1(Main, Dict, ChunkNum, ChunkNum) -> print_result(Dict), Main ! stop; collect_loop_1(Main, Dict, ChunkNum, ProcessedNum) -> receive {chunk_num, ChunkNumX} -> collect_loop_1(Main, Dict, ChunkNumX, ProcessedNum); {dict, DictX} -> Dict1 = dict:merge(fun (_, V1, V2) -> V1 + V2 end, Dict, DictX), collect_loop_1(Main, Dict1, ChunkNum, ProcessedNum + 1) end. print_result(Dict) -> SortedList = lists:reverse(lists:keysort(2, dict:to_list(Dict))), [io:format("~b\t: ~p~n", [V, K]) || {K, V} <- lists:sublist(SortedList, 10)]. scan_chunk(List) -> scan_chunk_1(List, dict:new()). scan_chunk_1(List, Dict) -> case List of [] -> Dict; "GET /ongoing/When/"++[_,_,_,$x,$/,Y1,Y2,Y3,Y4,$/,M1,M2,$/,D1,D2,$/|Rest] -> case match_until_space_newline(Rest, []) of {Rest1, []} -> scan_chunk_1(Rest1, Dict); {Rest1, Word} -> Key = list_to_binary([Y1,Y2,Y3,Y4,$/,M1,M2,$/,D1,D2,$/, Word]), scan_chunk_1(Rest1, dict:update_counter(Key, 1, Dict)) end; [_|Rest] -> scan_chunk_1(Rest, Dict) end. match_until_space_newline(List, Word) -> case List of [] -> {[], []}; [10|_] -> {List, []}; [$.|_] -> {List, []}; [$ |_] -> {List, lists:reverse(Word)}; [C|Rest] -> match_until_space_newline(Rest, [C | Word]) end.
I also wrote another corresponding binary version, which is 2-3 times slower than above list version on my machine, but the result may vary depending on your compiled Erlang/OTP on various operation system. I will test it again when Erlang/OTP R12B is released, which is claimed to have been optimized for binary match performance.
The code:
-module(tbray3_bin). -compile([native]). -export([start/1]). -define(BUFFER_SIZE, (4096 * 10000)). start(FileName) -> Start = now(), Main = self(), Collector = spawn(fun () -> collect_loop(Main) end), {ok, File} = file:open(FileName, [raw, binary]), read_file(File, Collector), %% don't terminate, wait here, until all tasks done. receive stop -> io:format("Time: ~p ms~n", [timer:now_diff(now(), Start) / 1000]) end. collect_loop(Main) -> collect_loop_1(Main, dict:new(), undefined, 0). collect_loop_1(Main, Dict, ChunkNum, ChunkNum) -> print_result(Dict), Main ! stop; collect_loop_1(Main, Dict, ChunkNum, ProcessedNum) -> receive {chunk_num, ChunkNumX} -> collect_loop_1(Main, Dict, ChunkNumX, ProcessedNum); {dict, DictX} -> Dict1 = dict:merge(fun (_, V1, V2) -> V1 + V2 end, Dict, DictX), collect_loop_1(Main, Dict1, ChunkNum, ProcessedNum + 1) end. print_result(Dict) -> SortedList = lists:reverse(lists:keysort(2, dict:to_list(Dict))), [io:format("~b\t: ~s~n", [V, K]) || {K, V} <- lists:sublist(SortedList, 10)]. read_file(File, Collector) -> read_file(File, <<>>, 0, Collector). read_file(File, PrevTail, I, Collector) -> case file:read(File, ?BUFFER_SIZE) of eof -> file:close(File), Collector ! {chunk_num, I}; {ok, Bin} -> {Data, NextTail} = split_on_last_newline(Bin), spawn(fun () -> Collector ! {dict, scan_chunk(<<PrevTail/binary, Data/binary>>)} end), read_file(File, NextTail, I + 1, Collector) end. split_on_last_newline(Bin) -> split_on_last_newline(Bin, size(Bin)). split_on_last_newline(Bin, Offset) -> case Bin of <<Data:Offset/binary,$\n,Tail/binary>> -> {Data, Tail}; _ when Offset =< 0 -> {Bin, <<>>}; _ -> split_on_last_newline(Bin, Offset - 1) end. scan_chunk(Bin) -> scan_chunk_1(Bin, 0, dict:new()). scan_chunk_1(Bin, Offset, Dict) when Offset < size(Bin) - 34 -> case Bin of <<_:Offset/binary,"GET /ongoing/When/",_,_,_,$x,$/,Y1,Y2,Y3,Y4,$/,M1,M2,$/,D1,D2,$/,Rest/binary>> -> case match_until_space_newline(Rest, 0) of {Rest1, <<>>} -> scan_chunk_1(Rest1, 0, Dict); {Rest1, Word} -> Key = <<Y1,Y2,Y3,Y4,$/,M1,M2,$/,D1,D2,$/, Word/binary>>, scan_chunk_1(Rest1, 0, dict:update_counter(Key, 1, Dict)) end; _ -> scan_chunk_1(Bin, Offset + 1, Dict) end; scan_chunk_1(_, _, Dict) -> Dict. match_until_space_newline(Bin, Offset) -> case Bin of <<Word:Offset/binary,$ ,Rest/binary>> -> {Rest, Word}; <<_:Offset/binary,$.,Rest/binary>> -> {Rest, <<>>}; <<_:Offset/binary,10,Rest/binary>> -> {Rest, <<>>}; <<_:Offset/binary,_,_/binary>> -> match_until_space_newline(Bin, Offset + 1); _ -> {<<>>, <<>>} end.
发表评论
-
Updated "From Rails to Erlyweb" Part I and Part II
2007-07-30 00:13 831I've updated "From Rails t ... -
From Rails to Erlyweb - Part IV
2007-07-30 00:14 876IV. Support Mysql Spatial Exten ... -
HTML Entity Refs and xmerl
2007-07-30 00:14 934According to [erlang-bugs] xme ... -
From Rails to Erlyweb - Part III
2007-07-30 00:14 8373. The Magic Behind Erlyweb Wit ... -
From Rails to Erlyweb - Part I
2007-08-05 06:34 1196Updated July 20 2007: new param ... -
ErlyBird 0.12.0 released - Erlang IDE based on NetBeans
2007-08-08 18:40 862I'm pleased to announce ErlyBir ... -
A Simple POET State Machine Accepting SAX Events to Build Plain Old Erlang Term
2007-08-20 07:19 951Per previous blogs: A Simple ... -
A Simple XML State Machine Accepting SAX Events to Build xmerl Compitable XML Tree: icalendar demo
2007-08-20 07:23 1095xmerl is a full XML functionali ... -
recbird - An Erlang Dynamic Record Inferring Parse Transform
2007-08-23 08:31 1015You should have read Yariv's re ... -
From Rails to Erlyweb - Part II
2007-08-23 21:16 1226Updated Aug 23: Please see Fro ... -
From Rails to Erlyweb - Part II Manage Project - Reloaded
2007-08-24 19:20 1318The migrating from Rails to Erl ... -
Parse JSON to xmerl Compitable XML Tree via A Simple XML State Machine
2007-09-02 20:00 1492Updated Aug 16: Fix bugs when j ... -
Tim Bray's Erlang Exercise on Large Dataset Processing
2007-10-10 20:12 1208Updated Oct 10: pread_file/5 s ... -
Tim Bray's Erlang Exercise on Large Dataset Processing - Round II
2007-10-15 15:37 1581Updated Oct 09: Added more ben ... -
Reading File in Parallel in Erlang (Was Tim Bray's Erlang Exercise - Round III)
2007-10-15 19:56 1864My first solution for Tim's ex ... -
Learning Coding Parallelization (Was Tim's Erlang Exercise - Round V)
2007-10-17 21:58 1526Updated Oct 16: After testing ...
相关推荐
esl-erlang_16.b.3-2~centos~6_amd64.rpm
esl-erlang_25.0.3-1_centos_7_amd64.rpm erlang_25.0.3-1 rpm用于rabitmq基础环境安装
esl-erlang_24.2.1-1_centos_7_amd64.rpm
erlang-xmerl-22.2.3-1.el7.x86_64.rpm 免费下载0积分镜像下载。rabbitMQ安装需要依赖此环境。Erlang 是一种多用途编程语言,主要用于开发并发和分布式系统。它最初是一种专有的编程语言,Ericsson 使用它来开发电话和...
erlang-24.3.3-1.el9.x86_64.rpm centos
erlang-xmerl-23.0.2-2.el7.x86_64.rpm,rabbitMQ安装需要依赖此环境。Erlang 是一种多用途编程语言,主要用于开发并发和分布式系统。它最初是一种专有的编程语言,Ericsson 使用它来开发电话和通信应用程序。
rabbitmq-server-3.8.17-1.el8.noarch.rpm可用的erlang
最新最全rabbitmq与erlang版本匹配-2020-04-23(world版)
This package contains the Erlang/OTP runtime implementation, which is configured and built with HiPE support (allows compiling to native code), and minimal set of Erlang applications: compiler - ...
rpm包; 包含erlang-23.3-rabbitmq-server-3.8
包括erlang-23.3.4.3-1.el7.x86_64.rpm和rabbitmq-server-3.8.17-1.el7.noarch.rpm以及安装步骤
erlang-crypto-19.3.6.4-1.el7.x86_64.rpm
erlang-asn1-19.3.6.4-1.el7.x86_64.rpm
erlang-xmerl-22.2.5-1.el7.x86_64.rpm 免费下载0积分镜像下载。rabbitMQ安装需要依赖此环境。Erlang 是一种多用途编程语言,主要用于开发并发和分布式系统。它最初是一种专有的编程语言,Ericsson 使用它来开发电话和...
rabbitMQ安装需要依赖erlang环境,这里提供centOS erlang21.3.1版本免费下载,适用于rabbitMQ对应版本
erlang-rpm-macros-0.1.4-1.el7.noarch.rpm
erlang-jinterface-19.3.6.4-1.el7.x86_64.rpm
erlang 安装包。 rabbitmq 安装所需要的
erlang-solutions-1.0-1.noarch.rpm erlang-solutions-1.0-1.noarch.rpm
erlang-23.1-1.el8.x86_ ,erlang官网下载很慢,所以提供该下载链接。