var http = require('http'),
fs = require('fs');
var toMarkdown = require('to-markdown').toMarkdown;
var htmlToText = require('html-to-text');
// Crawl configuration and entry point.
// Blog account name: appended to the domain to form the base URL, and also
// used by branchWebPage to recognise article links (websiteName + '/post').
var websiteName = "zerojudge";
// Highest listing-page index to request; clawingWebSite fetches the base URL
// for index 0 and url + "/P" + i for every i from 1 up to this value.
var websitePage = 141;
// Site root; article links found in listing pages are appended to it.
var websiteDomain = 'http://mypaper.pchome.com.tw/';
// Base URL of the blog's first listing page.
var url = websiteDomain + websiteName;
// Counter incremented once per saved article; it is embedded in the output
// file names written by parsingArticlePage.
var loadprocess = 0;
// Start the crawl (function declarations below are hoisted, so this call is valid here).
clawingWebSite(url, websitePage);
/**
 * Requests every listing page of the blog, from index 0 through websitePage.
 *
 * Index 0 maps to the base URL itself; every other index N maps to
 * url + "/P" + N. Each page URL is handed to clawingWebPage in ascending
 * order; nothing is returned.
 *
 * @param {string} url - Base URL of the blog (first listing page).
 * @param {number} websitePage - Highest page index to request (inclusive).
 */
function clawingWebSite(url, websitePage) {
    var pageNumber = 0;
    while (pageNumber <= websitePage) {
        var pageUrl = pageNumber === 0 ? url : url + "/P" + pageNumber;
        clawingWebPage(pageUrl);
        pageNumber += 1;
    }
}
/**
 * Crawls a single listing page by delegating to readWebPage.
 *
 * @param {string} url - URL of the listing page to fetch.
 */
function clawingWebPage(url) {
readWebPage(url);
}
/**
 * Scans the HTML of a listing page for article links and starts a download
 * for each one via readWebArticleLink.
 *
 * Scanning begins at the first 'class="blog"' marker. From the current
 * position the next occurrence of websiteName + '/post' is taken as the start
 * of a link, which runs up to (not including) the next double quote. After
 * each link the scan jumps to the following 'title brk_h' marker, so at most
 * one link is taken per such marker.
 *
 * @param {string} source - HTML of a listing page.
 */
function branchWebPage(source) {
    var linkPrefix = websiteName + '/post';
    var cursor = source.indexOf('class="blog"');
    while (cursor >= 0) {
        var linkStart = source.indexOf(linkPrefix, cursor);
        if (linkStart < 0)
            break;
        var linkEnd = source.indexOf('"', linkStart);
        // Without a closing quote the link text cannot be delimited; stop
        // instead of requesting the bare domain with an empty path.
        if (linkEnd < 0)
            break;
        var articleLink = source.slice(linkStart, linkEnd);
        readWebArticleLink(websiteDomain + articleLink, articleLink);
        cursor = source.indexOf('title brk_h', linkStart + 1);
    }
}
/**
 * Extracts title, tags and body from an article page and saves three files
 * in the current working directory, numbered with the shared loadprocess
 * counter N:
 *   - sourcePageN.html : the untouched page source
 *   - textPCN.md       : a title/date/tags header terminated by "---",
 *                        followed by the plain-text body (each line prefixed
 *                        with one space)
 *   - oldPCN.md        : the body converted by toMarkdown
 *
 * Pages without a recognisable article body are skipped: nothing is written
 * and the counter is not advanced. Write failures are logged, not thrown.
 *
 * @param {string} source - Full HTML of the article page.
 * @param {string} [fileName] - Currently unused; kept for caller compatibility.
 */
function parsingArticlePage(source, fileName) {
    var articleTitle = extractArticleTitle(source);
    console.log('Title = ' + articleTitle);

    var bodyStart = source.indexOf('<div class="innertext brk_h"');
    var bodyEnd = source.indexOf('<div id="ArticleMapTitle"');
    // Both markers must exist, in order, with something between them. The
    // explicit bodyStart test matters: a start of -1 would otherwise pass the
    // length test and silently read from the end of the string.
    if (bodyStart < 0 || bodyEnd - bodyStart - 1 <= 0)
        return;
    // The single character right before the end marker is dropped, then the
    // wrapper div is closed again so the fragment stays balanced.
    var sourceBody = source.slice(bodyStart, bodyEnd - 1) + "</div>";

    // Square brackets in the title are swapped for full-width brackets.
    articleTitle = articleTitle.replace(/\[/g, "【").replace(/\]/g, "】");

    var text = htmlToText.fromString(sourceBody, {
        wordwrap: 130
    });
    var contentBody = toMarkdown(sourceBody);

    var mdFormat = "title: " + articleTitle + "\n";
    mdFormat += "date: 2014-04-10 20:10:28" + "\n";
    mdFormat += "tags: " + "\n";
    var tags = extractArticleTags(source);
    for (var i = 0; i < tags.length; i++) {
        console.log(tags[i]);
        mdFormat += "- " + tags[i] + "\n";
    }
    mdFormat += "---" + "\n\n";
    mdFormat += (" " + text).replace(/\n/g, "\n ");

    loadprocess++;
    saveArticleFile('sourcePage' + loadprocess + '.html', source);
    saveArticleFile('textPC' + loadprocess + '.md', mdFormat);
    saveArticleFile('oldPC' + loadprocess + '.md', contentBody);
}

/**
 * Returns the value of the first content="..." attribute that follows the
 * name="keywords" marker, or "" when any part of it cannot be located.
 */
function extractArticleTitle(source) {
    var attribute = 'content="';
    var metaIndex = source.indexOf('name="keywords"');
    if (metaIndex < 0)
        return "";
    var valueStart = source.indexOf(attribute, metaIndex);
    if (valueStart < 0)
        return "";
    valueStart += attribute.length;
    // Search from the first character of the value so that an empty
    // content="" is closed by its own quote.
    var valueEnd = source.indexOf('"', valueStart);
    if (valueEnd < 0)
        return "";
    return source.slice(valueStart, valueEnd);
}

/**
 * Returns the link texts of all 'search_fields=tag">' anchors located between
 * the article_tag marker and the article_author marker, in document order.
 * Returns an empty array when either marker is missing.
 */
function extractArticleTags(source) {
    var sectionMarker = '<div id="article_tag">';
    var linkMarker = 'search_fields=tag">';
    var tags = [];

    var sectionStart = source.indexOf(sectionMarker);
    var sectionEnd = source.indexOf('<div id="article_author" align="right">');
    if (sectionStart < 0 || sectionEnd < 0)
        return tags;

    var cursor = sectionStart + sectionMarker.length + 1;
    while (true) {
        // Test the raw indexOf result BEFORE adding the marker length;
        // afterwards a "not found" (-1) can no longer be recognised.
        var found = source.indexOf(linkMarker, cursor);
        if (found < 0)
            break;
        var nameStart = found + linkMarker.length;
        if (nameStart > sectionEnd)
            break;
        var nameEnd = source.indexOf('</a>', nameStart);
        // An unterminated anchor would reset the cursor to the start of the
        // document and loop forever.
        if (nameEnd < 0)
            break;
        tags.push(source.slice(nameStart, nameEnd));
        cursor = nameEnd;
    }
    return tags;
}

/**
 * Writes data to path as UTF-8, creating or truncating the file. fs.writeFile
 * opens, writes and closes in one step, so no descriptor is left open when
 * the write fails. Errors are logged and otherwise ignored.
 */
function saveArticleFile(path, data) {
    fs.writeFile(path, data, 'utf8', function(e) {
        if (e) {
            console.log('错误信息:' + e);
        }
    });
}
/**
 * Downloads one article page over HTTP and passes the complete body to
 * parsingArticlePage once the response has ended.
 *
 * Request errors are logged together with the URL and are not rethrown.
 *
 * @param {string} url - Absolute URL of the article page.
 * @param {string} [fileName] - Optional second argument supplied by the
 *     caller; forwarded unchanged to parsingArticlePage.
 */
function readWebArticleLink(url, fileName) {
    http.get(url, function(res) {
        var source = "";
        // Let the stream decode UTF-8 itself: appending raw Buffer chunks
        // converts each chunk separately and corrupts any multi-byte
        // character that happens to be split across two chunks.
        res.setEncoding('utf8');
        res.on('data', function(chunk) {
            source += chunk;
        });
        res.on('end', function() {
            parsingArticlePage(source, fileName);
        });
    }).on('error', function(err) {
        console.log("获取数据出现错误", url, err && err.message);
    });
}
/**
 * Downloads one listing page over HTTP and passes the complete body to
 * branchWebPage once the response has ended.
 *
 * Request errors are logged together with the URL and are not rethrown.
 *
 * @param {string} url - Absolute URL of the listing page.
 */
function readWebPage(url) {
    http.get(url, function(res) {
        var source = "";
        // Let the stream decode UTF-8 itself: appending raw Buffer chunks
        // converts each chunk separately and corrupts any multi-byte
        // character that happens to be split across two chunks.
        res.setEncoding('utf8');
        res.on('data', function(chunk) {
            source += chunk;
        });
        res.on('end', function() {
            branchWebPage(source);
        });
    }).on('error', function(err) {
        console.log("获取数据出现错误", url, err && err.message);
    });
}