我之前写过一个 spider,下面列出它的部分设置项,火车头你看看对你的新功能有没有参考作用:
config.ini
------
[Aplaction]
// NOTE(review): the section name above looks like a misspelling of "Application";
// it is left untouched because the reading program presumably looks it up by this exact name.
// Whether the application is shown in the taskbar.
ShowInTaskbar = 0
// NOTE(review): presumably the tooltip/caption text displayed for the application - confirm.
Tips=猪头软件国内文章搜索
[WebSite]
// Interval, in seconds, between checks for site updates: once one round of
// download-and-analysis finishes, the next round starts after this delay
// (effectively a refresh interval).
// NOTE(review): the original comment cited 180 seconds, but the configured
// value is 18000 - confirm which of the two is intended.
dTime= 18000
// NOTE(review): the value after the delimiter is empty and two URLs follow on
// their own lines; this may be a paste/line-wrap artefact - confirm how the
// reader is expected to pick up the URL(s).
Url1 =
http://house.gznet.com/house/chouse_list.html
http://localhost/chouse_list.html
// NOTE(review): presumably the site name that selects the per-site settings file - confirm.
Name1 = gznet
gznet.ini
----------
[DownLoad]
// Server address the extracted data is uploaded to once pages have been
// downloaded and analysed.
// NOTE(review): the URL sits on the line after the key - confirm the reader handles that.
UpLoadUrl =
http://.../test/gznet.asp
// Base address used to turn relative image paths into absolute ones.
// NOTE(review): two candidate addresses follow the key on separate lines - confirm which is active.
BaseUrl =
http://house.gznet.com/house/
http://localhost/
// Last page analysed in the previous run; mainly used to detect whether the
// site has published new pages since then.
LastestUrl =chouse_succ.php?MARK=look&HouseID=1800306523.0
// Delay, in seconds, between analysing individual pages; mainly to keep the
// target site from noticing that its data is being read.
dTime = 1
StartUrl=
[Parser]
// Matches anchors that link to a listing detail page; group 1 captures the
// relative URL (chouse_succ.php with its MARK and HouseID query parameters).
// The dot inside the HouseID number is escaped so that it matches only a
// literal "." (as in the LastestUrl sample ending in 1800306523.0) instead of
// any character.
IndexRegex = <td><a\s+href="(chouse_succ\.php\?MARK=look&HouseID=\d+\.\d\s*)"\s+target=
// Rooms (室): digits at the start of the cell that follows the 户型 (layout) label.
HeadRegex1 = <div align="right">户\s*型:</div>\s+</td>\s*<td\s+bgcolor="#FFFFFF">\s*(\d+)
// Living rooms (厅): digits between 室 and 厅.
HeadRegex2 = 室\s*(\d*)\s*厅
// Bathrooms (卫): digits between 厅 and 卫.
HeadRegex3 = 厅\s*(\d*)\s*卫
// Kitchens (厨): digits between 卫 and 厨.
HeadRegex4 = 卫\s*(\d*)\s*厨
// Type (类型).
HeadRegex5 = <div align="right">类 型:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Orientation of the house (房屋座向).
HeadRegex6 = <div align="right">房\s*屋\s*座\s*向\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Year of construction (建筑年份).
HeadRegex7 = <div align="right">建\s*筑\s*年\s*份\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Building area (建筑面积).
HeadRegex8 = <div align="right">建\s*筑\s*面\s*积\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Usable area (使用面积).
HeadRegex9 = <div align="right">使\s*用\s*面\s*积\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Rental price (租金价格); the character class also admits "/", parentheses and 人民币.
HeadRegex10 = <div align="right">租\s*金\s*价\s*格\s*:</div>\s*</td>\s*<td[^>]*>([\s\w\/(人民币) ]*)</td>
// Decoration / fit-out of the house (房屋装修).
HeadRegex11 = <div align="right">房\s*屋\s*装\s*修\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// District the residence is in (住宅所在区).
HeadRegex12 = <div align="right">住\s*宅\s*所\s*在\s*区\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Residential compound name (小区名).
HeadRegex13 = <div align="right">小\s*区\s*名\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Residence address (住宅地址).
HeadRegex14 = <div align="right">住\s*宅\s*地\s*址\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Time the listing was published (信息发布时间); the class also admits "/" and ":".
HeadRegex15 = <div align="right">信\s*息\s*发\s*布\s*时\s*间\s*:</div>\s*</td>\s*<td[^>]*>([\s\w/:]*)</td>
// Validity period (有效时间); the class also admits "/" and ":".
HeadRegex16 = <div align="right">有\s*效\s*时\s*间\s*:</div>\s*</td>\s*<td[^>]*>([\s\w/:]*)</td>
// Contact person (联系人).
HeadRegex17 = <div align="right">联\s*系\s*人\s*:</div>\s*</td>\s*<td[^>]*>([\s\w]*)</td>
// Contact e-mail; captures everything up to the next "<".
HeadRegex18 = <div align="right">email\s*:</div>\s*</td>\s*<td[^>]*>([^<]*)</td>
// Contact phone (联系电话); captures everything up to the next "<".
HeadRegex19 = <div align="right">联系电话:</div>\s*</td>\s*<td[^>]*>([^<]*)</td>
// Fax (传真); captures everything up to the next "<".
HeadRegex20 = <div align="right">传 真:</div>\s*</td>\s*<td[^>]*>([^<]*)</td>
// Contact address (联系地址); captures everything up to the next "<".
HeadRegex21 = <div align="right">联系地址:</div>\s*</td>\s*<td[^>]*>([^<]*)</td>
// Detailed description (详细说明): text inside the span with class "font".
HeadRegex22 = <div align="right">详细说明:</div>\s*</td>\s*<td[^>]*>\s*<span class=font>\s*([\s\w,.&;<>]*)</span>
// Literal pattern 个人 ("individual" / "personal").
// NOTE(review): presumably a content filter that keeps only listings posted by
// individuals - confirm against the program that reads this key.
ContRegex = 个人
// Disabled setting, kept exactly as found.
//HeadReIndex = (0)
// Query-string fragments: HeadParam1 opens the query with "?", every later
// entry continues it with "&", and each fragment ends with an equals sign.
// NOTE(review): presumably HeadParamN is paired by index with HeadRegexN and the
// captured text is appended after the fragment before upload - confirm.
// Rooms, living rooms, bathrooms, kitchens.
HeadParam1 = ?室=
HeadParam2 = &厅=
HeadParam3 = &卫=
HeadParam4 = &厨=
// Type, orientation, year, building area, usable area, price, decoration.
HeadParam5 = &类型=
HeadParam6 = &座向=
HeadParam7 = &年份=
HeadParam8 = &面积=
HeadParam9 = &使用面积=
HeadParam10 = &价格=
HeadParam11 = &装修=
// District, compound name, residence address.
HeadParam12 = &区=
HeadParam13 = &小区=
HeadParam14 = &地址=
// Publication time, validity period.
HeadParam15 = &发布时间=
HeadParam16 = &有效时间=
// Contact person, e-mail, phone, fax.
HeadParam17 = &联系人=
HeadParam18 = &email=
HeadParam19 = &联系电话=
HeadParam20 = &传真=
// NOTE(review): this fragment uses the same parameter name as HeadParam14
// (地址, "address"), while HeadRegex21 matches the 联系地址 ("contact address")
// label; the two values would share one parameter name. Left unchanged because
// the receiving script defines the expected names - confirm whether 联系地址 was meant.
HeadParam21 = &地址=
// Detailed description.
HeadParam22 = &说明=
// Constant fragment identifying the source site.
ContParam1 = &site=gznet