ad_replicate_web_robots_db db
What it does:
Replicates data from the Web Robots Database (http://info.webcrawler.com/mak/projects/robots/active.html) into a table in the ACS database. The data is published on the Web as a flat file, whose format is specified in http://info.webcrawler.com/mak/projects/robots/active/schema.txt. Basically, each non-blank line of the database corresponds to one field (name-value pair) of a record that defines the characteristics of a registered robot. Each record has a "robot-id" field as a unique identifier. (There are many fields in the schema, but, for now, the only ones we care about are: robot-id, robot-name, robot-details-url, and robot-useragent.)
Defined in: /web/philip/tcl/ad-robot-defs.tcl
Returns the number of rows replicated. May raise a Tcl error that should be caught by the caller.
Source code:
# Body of ad_replicate_web_robots_db.
#
# Fetches the Web Robots Database flat file from the URL configured in the
# WebRobotsDB parameter (robot-detection section), parses it line by line,
# and inserts or updates one row in the "robots" table per robot record
# that has a non-empty robot-useragent.
#
# Uses:    $db - database handle passed in by the caller.
# Returns: the number of rows inserted or updated.
# Errors:  nothing is caught here, so an error raised by ns_geturl or
#          ns_db propagates to the caller.

set web_robots_db_url [ad_parameter WebRobotsDB robot-detection]
set result [ns_geturl $web_robots_db_url headers]
set page [split $result "\n"]

# A set in which to accumulate the fields of the record currently being
# read from the file.
set robot [ns_set create]
set robot_count 0

foreach line $page {
    # A "robot-id" line delimits a new record, so each time we encounter
    # one we must first write the prior record (if there is one) into the
    # database. The only case in which there is *no* prior record is the
    # very first "robot-id" line.
    if {[regexp "robot-id: *(.+)" $line match robot_id]} {
        set prior_robot_id [ns_set get $robot "robot_id"]

        if {![empty_string_p $prior_robot_id]} {
            # Load the record only if it has an actual "robot_useragent"
            # value: update the row if one with the same robot_id already
            # exists, otherwise insert it. (There's no point in keeping
            # info about robots that we can't identify.)
            if {![empty_string_p [ns_set get $robot "robot_useragent"]]} {
                if {[robot_exists_p $db $prior_robot_id]} {
                    # Log the id of the record being written, not the id
                    # of the record that has just started.
                    ns_log Notice "Updating existing robot: $prior_robot_id"
                    # NOTE(review): unlike the update for the final record
                    # below, this statement does not touch insertion_date.
                    # Confirm which behavior is intended.
                    ns_db dml $db "update robots set robot_name = '[DoubleApos [ns_set get $robot "robot_name"]]', robot_details_url = '[DoubleApos [ns_set get $robot "robot_details_url"]]', robot_useragent = '[DoubleApos [ns_set get $robot "robot_useragent"]]' where robot_id = '[DoubleApos $prior_robot_id]'"
                } else {
                    ns_log Notice "Inserting new robot: $prior_robot_id"
                    ns_db dml $db "insert into robots(robot_id, robot_name, robot_details_url, robot_useragent) values('[DoubleApos $prior_robot_id]', '[DoubleApos [ns_set get $robot "robot_name"]]', '[DoubleApos [ns_set get $robot "robot_details_url"]]', '[DoubleApos [ns_set get $robot "robot_useragent"]]')"
                }
                incr robot_count
            }

            # Clear out the record so we can start anew.
            ns_set delkey $robot "robot_id"
            ns_set delkey $robot "robot_name"
            ns_set delkey $robot "robot_details_url"
            ns_set delkey $robot "robot_useragent"
        }

        ns_set put $robot "robot_id" [string trim $robot_id]
    }

    if {[regexp "robot-name: *(.+)" $line match robot_name]} {
        ns_set put $robot "robot_name" [string trim $robot_name]
    }

    if {[regexp "robot-details-url: *(.+)" $line match robot_details_url]} {
        ns_set put $robot "robot_details_url" [string trim $robot_details_url]
    }

    if {[regexp "robot-useragent: *(.+)" $line match robot_useragent]} {
        ns_set put $robot "robot_useragent" [string trim $robot_useragent]
    }
}

# Don't forget the last record. Its id lives in the $robot set; the
# prior_robot_id variable still names the record *before* it (or is unset
# when the file held a single record), so it must not be used here.
set last_robot_id [ns_set get $robot "robot_id"]

if {![empty_string_p $last_robot_id] && ![empty_string_p [ns_set get $robot "robot_useragent"]]} {
    if {[robot_exists_p $db $last_robot_id]} {
        ns_log Notice "Updating existing robot: $last_robot_id"
        # NOTE(review): this update also sets insertion_date = sysdate,
        # which the in-loop update above does not. Confirm which is intended.
        ns_db dml $db "update robots set robot_name = '[DoubleApos [ns_set get $robot "robot_name"]]', robot_details_url = '[DoubleApos [ns_set get $robot "robot_details_url"]]', robot_useragent = '[DoubleApos [ns_set get $robot "robot_useragent"]]', insertion_date = sysdate where robot_id = '[DoubleApos $last_robot_id]'"
    } else {
        ns_log Notice "Inserting new robot: $last_robot_id"
        ns_db dml $db "insert into robots(robot_id, robot_name, robot_details_url, robot_useragent) values('[DoubleApos $last_robot_id]', '[DoubleApos [ns_set get $robot "robot_name"]]', '[DoubleApos [ns_set get $robot "robot_details_url"]]', '[DoubleApos [ns_set get $robot "robot_useragent"]]')"
    }
    incr robot_count
}

return $robot_count