sys.config

%% -*- mode: erlang -*-

[
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %% SASL config
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 {sasl, [
	 {sasl_error_logger, {file, "priv/log/sasl-error.log"}},
	 {errlog_type, error},
	 {error_logger_mf_dir, "priv/log/sasl"},      % Log directory
	 {error_logger_mf_maxbytes, 10485760},   % 10 MB max file size
	 {error_logger_mf_maxfiles, 5}           % 5 files max
	]
 },
 {ebot, [

	 %% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	 %% see EBOT options in ebot.app and add your changes here! 
	 %% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	 %% CACHE
	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	 %% DATABASE
	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	 %%
	 %% you need to set the db backend (COUCHDB or RIAK)
	 %% in src/ebot.hrl file
	 {db_hostname, "127.0.0.1"},
	 %% COUCHDB
	 %%{db_port, 5984},
	 %% RIAK 
	 {db_port, 8087},

	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	 %% MQ
	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	 %% WEB
	 %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	 %% -------------------------------------------------------------------------------------------------
	 %% normalize_url_list
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% {normalize_url_list, [{RE,  NormalizeUrlOptions},..]}
	 %%
	 %% options of normalize_url : 
	 %%    add_final_slash
	 %%    to_lower_case : urls are case insensive and some web pages have links with some uppercase letters..
	 %%    without_internal_links
	 %%    without_queries,
	 %%    {max_depth, 2}
	 %%          the url path will be truncated to a max_depth path 
	 %%          http://www.redaelli.org/matteo/blog/a/ ->  http://www.redaelli.org/matteo/blog/
	 %%          should be the same as "tot_new_urls_queues" in ebot_mq.conf
	 %%	    you should also start at least one crawler for depth in [0,max_depth]. see "worker_pools" in this file
	 %%    (TODO)  {remove_filename, false}
	 {normalize_url, 
	  [
	   %% {\\.com/", 
	   %%  [	
	   %% 	{plugin, ebot_url_util, url_domain},
	   %% 	add_final_slash,
	   %% 	to_lower_case
	   %%  ]
	   %% },

	   %% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	   %% default setting for normalize_url
	   %% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	   %% rememeber, at least one regexp must match all urls
	   %% "." should be used
	   {".", 
	    [
	     %% -------------------------------------------------------------------
	     %% {plugin, Module, Function/1}
	     %% -------------------------------------------------------------------
	     %% you can call a custom module:function(Url) for normaling urls
		  
	     %% are you interested only in domain homepages?
	     %% {plugin, ebot_url_util, url_domain},
		  
	     %% removing blank characters at the begin and end of url string:
	     %% yes, sometime happens!
	     strip,

	     %% -------------------------------------------------------------------
	     %% {replace_string, [{from,to},..]}
	     %% -------------------------------------------------------------------	
	     {replace_string, [
			       %% http://www.gettyre.it/motoweb/XXX;jsessionid=250485C.sae_1
			       {";[A-Za-z0-9]+=[^&;?]+", ""},
			       %% some sites have newlines in url links: 
			       %% see http://opensource.linux-mirror.org/index.php
			       %% TODO maybe it still doesn t work
			       {"\n",""},
			       %% http://github.com/dizzyd/ibrowse
			       {"&quot\$",""}
			      ]},
	     %% -------------------------------------------------------------------
	     %% add_final_slash
	     %% -------------------------------------------------------------------
	     %% example: http://www.redaelli.org => http://www.redaelli.org/
	     add_final_slash,
	     
	     %% -------------------------------------------------------------------
	     %% {max_depth, 3}
	     %% -------------------------------------------------------------------
	     %% paths > max_depth are truncated to max_depth
	     %% for instance, if {max_depth,0}
	     %% http://www.redaelli.org/matteo/ => http://www.redaelli.org/
	     {max_depth, 4}, 
				
	     %% -------------------------------------------------------------------
	     %% to_lower_case
	     %% -------------------------------------------------------------------
	     %% for some web servers web urls are case insensitive.
	     %% it is safer to lowecase all urls in order to avoid duplicates in the database
	     %% {plugin, string, to_lower},
	     
	     %% -------------------------------------------------------------------
	     %% without_internal_links
	     %% -------------------------------------------------------------------
	     %% internal links (#) are removed
	     without_internal_links,
	     
	     %% -------------------------------------------------------------------
	     %% without_queries
	     %% -------------------------------------------------------------------
	     %% parameters, like ?a=1&b=3, are removed from urls
	     without_queries
	    ]}%% end default
	  ] % end list of {regexp, ListOptions}
	 }, %% end normalize_url
	 
	 %% -------------------------------------------------------------------------------------------------
	 %% tobe_saved_headers
	 %% -------------------------------------------------------------------------------------------------
	 %% headers (if exist) that will be saved in the database
	 {tobe_saved_headers, 
	  [
	   <<"content-length">>,
	   <<"content-type">>,
	   <<"server">>,
	   <<"x-powered-by">>
	  ]},
	 %% -------------------------------------------------------------------------------------------------
	 %% is_valid_image
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% this option is useful to check links before converting them to absolute urls
	 %% when they are relative links
	 %% 
	 {is_valid_image, 
	  [
	   %% the url will be analyzed if ALL regexps will be satisfied
	   {validate_all_url_regexps, [
				       {nomatch, "\.bmp$"},
				       {nomatch, "\.raw$"}
				      ]
	   }
	  ]
	 },
	 %% -------------------------------------------------------------------------------------------------
	 %% is_valid_link
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% this option is useful to check links before converting them to absolute urls
	 %% when they are relative links
	 %% 
	 {is_valid_link, 
	  [
	   %% the url will be analyzed if ALL regexps will be satisfied
	   {validate_all_url_regexps, [
				       {nomatch, "feed:"},
				       {nomatch, "ftp:"},
				       {nomatch, "javascript:"},
				       {nomatch, "mailto:"},
				       {nomatch, "news:"}
				      ]
	    }
	  ]
	 },

	 %% -------------------------------------------------------------------------------------------------
	 %% is_valid_url
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 {is_valid_url, 
	  [
	   %% you can call your custom function that wull return true or false
	   %% 	{plugin, Module, function}, 

	   %% silly function : {plugin, erlang, is_list},
	   %% an url is valid if its mime type satify any of the following regexps
	   {validate_any_mime_regexps, [
					{match,   "^text/"}
					%%,{match,   "^image/"}
				       ]
	   },
	   %% the url will be analyzed if ALL the following regexps will be satisfied
	   {validate_all_url_regexps, [	   
				       {match, "^https?://"},
				       %% {nomatch, "^https"},

				       {nomatch, "//.+//"},
				       {nomatch, "/bugs/"},
				       {nomatch, "viewcvs"},
				       %% Skipping Apache.org urls
				       {nomatch, "\\.apache\\..+/dist/"},
				       {nomatch, "/snapshots/"},
				       {nomatch, "^http://mail-archives"},
				       {nomatch, "bugs.+/.+"},
				       %% apache mirror sites.. TODO
				       {nomatch, "apache\\.fastbull\\.org/.+"},
				       
				       %% Skipping unwanted files
				       {nomatch, "\\.deb$"},
			               {nomatch, "\\.git$"},
				       {nomatch, "\\.tgz$"},
				       {nomatch, "\\.jar$"},
			               {nomatch, "\\.rpm$"},
				       {nomatch, "\\.tar$"},
				       {nomatch, "\\.gz$"},
                                       {nomatch, "\\.makefile$"},
                                       {nomatch, "\\.Makefile$"},
				       % Skipping CVS repositories
				       {nomatch, "/cvs/\\."},
				       
				       %% Skipping Github unseful pages
				       {nomatch, "github\\.+/issues"},
				       {nomatch, "gist\\.github\\.com"},
				       %% the page gives incomplete header
				       {nomatch, "svn\\.github\\.com"},
				   
				       %% Skipping Gitorious unseful pages
				       {nomatch, "git.+/merge_requests/"},
				       {nomatch, "git.+/commits/"},
				       {nomatch, "git.+/trees/"},
				       
				       %% Skipping Git repositories	  		
				       {nomatch, "git.+/commit/"},
				       {nomatch, "git.+/tree/"},
				       
				       %% Skipping HG repositories
				       {nomatch, "/changeset/"},
				       
				       %% Skipping SVN repositories
				       {nomatch, "svn.+/viewvc/.+/"},
				       {nomatch, "/svn[\\./]"},
				       {nomatch, "/branches"},
				       {nomatch, "/trunk"},
				       {nomatch, "/tags"}
				  ]
	   }, %% end of validate_all_regexps
	   
	   %% The url will be analyzed if ANY of the following regexps will be satisfied.
	   %% Here you should put the list of web sites to be visited by ebot.
 
	   {validate_any_url_regexps, [
				       %% at least one regexp must be defined
				       %%  {match,"."},
                                       %{match, "redaelli\\.org"},
                                       %% Opensource projects
				       %{match, "apache\\.org"},
				       %{match, "freshmeat\\.net"},
				       %{match, "github\\.com"},
				       %{match, "code\\.google\\.com"},
				       %{match, "sourceforge\\.net"},
				       %{match, "ohloh\\.net"},
				       %{match, "bitbucket\\.org"},
                                       %{match, "www\\.gettyre\\.it"},
                                       %{match, "www\\.tyres-pneus-online\\.co\\.uk"}
				       {match, "linux\\.org\\.ru"},
				       {match, "newsru\\.com"}
				      ]
	   } 
	  ] %% end list of options 
	 } %% end is_valid_url
	 ,

	 %% -------------------------------------------------------------------------------------------------
	 %% obsolete_urls_after_day
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% after how many days, an url that is stored in the DB will become obsolete
	 
	 {obsolete_urls_after_days, 10},

	 %% -------------------------------------------------------------------------------------------------
	 %% save_referrals
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% (cumulable) values: external, domain, subdomain 
	 %%
	 %% domain: means same domain, 
	 %% es: true  <= http://www.redaelli.org/a and http://www.redaelli.org/
	 %%     false <= http://www.redaelli.org/a and http://redaelli.org/
	 %%
	 %% subdomain: means same main domain but not same domain, 
	 %% es: false <= http://www.redaelli.org/a and http://www.redaelli.org/
	 %%     true  <= http://www.redaelli.org/a and http://redaelli.org/
	 %%
	 %% external: means samenot same domain and not same main domain 
	 %% es: false <= http://www.redaelli.org/a and http://www.redaelli.org/
	 %%     false <= http://www.redaelli.org/a and http://redaelli.org/
	 %%     true  <=  http://www.redaelli.org/a and http://matteoredaelli.wordpress.com/
	 
	 {save_referrals, [external]},

	 %% -------------------------------------------------------------------------------------------------
	 %% workers_pool
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% how many crawler threads will be started for each candidated url queue/depth
	 %% {worker_pools, [{0,3},{1,2},{2,1}]} means
	 %% 3 crawlers will analyze urls got from AMQP queue ebot.new.0 that countains urls with depth==0 
	 %%   (ex. http://www.redaelli.org, http://www.redaelli.org/index.html)
	 %% 2 crawlers will analyze urls got from AMQP queue ebot.new.1 that countains urls with depth==1
	 %%   (ex. http://www.redaelli.org/matteo/, http://www.redaelli.org/matteo/index.html)
	 %% 1 crawlers will analyze urls got from AMQP queue ebot.new.2 that countains urls with depth==2

	 %%{workers_pool, [{0,4}, {1,2}, {2,1}] },
	 {workers_pool, [{0,2}, {1,2}, {2,2}, {3,2}, {4,2} ] },
	 %% -------------------------------------------------------------------------------------------------
	 %% start_workers_at_boot
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% are the crawlers started automatically at boot time? 
	 {start_workers_at_boot, true},

	 %% -------------------------------------------------------------------------------------------------
	 %% workers_sleep_time
	 %% -------------------------------------------------------------------------------------------------
	 %%
	 %% how many milliseconds will each crawler sleep between two url crawls?
	 %% this option is useful in order to avoid heavy workloads for the visited websites
	 %% and for the ebot system if the hardware is not enough powerful

	 {workers_sleep_time, 2000}
		 
	]
 }
].