-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsoupdownloader.js
160 lines (138 loc) · 5.23 KB
/
soupdownloader.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
var fs = require('fs'),
lazy = require('lazy'),
async = require ('async'),
http = require('http');
var count = 0,
souppiccounter = 0,
soupRSSfilpath = process.argv[4], //'./soup_falk_2013-01-03.rss', // change to reflect yours
stream = fs.createReadStream(soupRSSfilpath),
options = {},
lastTitle = '',
metaData = process.argv[2]; // if you want a metadatafile to each image from the soup info
var parralelDLs = process.argv[3]; // how many pictures will be downloaded in parralel - adjust to soup speed of the day and you connection speed
var path = process.argv[5]; //'./soupImages/'; // change to reflect yours
// first we check if path exists if it does not we create it - this is synchronous as without a path there is no saving a picture
if (!fs.existsSync(path)) {
console.log('Creating Directory: ' + path);
fs.mkdirSync(path);
};
// Extract a usable filename and a human-readable metadata string from one
// <soup:attributes> JSON object found in the RSS file.
// Returns { filename, stringedSoup }:
//   filename     - '<site>-<originalname>' derived from the "source" URL,
//                  or '' when no usable source is present
//   stringedSoup - the attributes object flattened into "key: value" lines
function cleverMetadataGenerator(souplineOBJ) {
    // Flatten the JSON: commas become (double) newlines, quotes/braces are
    // stripped, and a space is added after each key colon — the lookahead
    // keeps the "://" inside URLs intact.
    var stringedsouplineOBJ = JSON.stringify(souplineOBJ)
        .replace(new RegExp(',', 'g'), "\n")
        .replace(new RegExp('\n', 'g'), "\n\n")
        .replace(new RegExp('["{}]', 'g'), "")
        .replace(new RegExp('(?!:\/\/):', 'g'), ": ");
    if ("source" in souplineOBJ && souplineOBJ.source != null) {
        // Drop a trailing slash so the last path segment is the file name.
        if (souplineOBJ.source.charAt(souplineOBJ.source.toString().length - 1) == "/") {
            souplineOBJ.source = souplineOBJ.source.slice(0, -1);
        }
        var sourceArray = souplineOBJ.source.split('/');
        // Guard against malformed source values with no host part — the
        // original crashed here on sourceArray[2].split of undefined.
        if (sourceArray.length > 2 && sourceArray[2]) {
            var mainURLArray = sourceArray[2].split('.');
            // second-level domain, e.g. 'example' from 'www.example.com'
            var mainURLName = mainURLArray[mainURLArray.length - 2];
            // original file name with %xx URL escapes stripped
            var originalName = sourceArray[sourceArray.length - 1].split('.')[0]
                .replace(new RegExp('%..', 'gi'), '');
            return {
                filename: mainURLName + '-' + originalName,
                stringedSoup: stringedsouplineOBJ
            };
        }
    }
    return {
        filename: '',
        stringedSoup: stringedsouplineOBJ
    };
}
// Take apart one RSS line:
// - remember the latest <title> (it belongs to the following attributes line)
// - when a <soup:attributes> JSON blob with a "url" field is found, download
//   the image and (when the metaData CLI flag is set) write a metadata .txt
//   file next to it.
// The queue callback is invoked exactly once on every code path.
function downloader(task, callback) {
    var rawLine = task.line.toString("utf8");
    var soupline = rawLine.match("<soup:attributes>(.*?)</soup:attributes>");
    var title = rawLine.match("<title>(.*?)</title>");
    if (title != null) {
        lastTitle = title[1];
    }
    if (soupline == null) {
        callback();
        return;
    }
    var souplineOBJ = JSON.parse(soupline[1]);
    if (lastTitle) {
        souplineOBJ.title = lastTitle;
    }
    if (!("url" in souplineOBJ)) {
        callback();
        return;
    }
    souppiccounter++;
    var metadata = cleverMetadataGenerator(souplineOBJ);
    var souplineURLArray = souplineOBJ.url.split("/");
    options = {
        host: souplineURLArray[2],
        port: 80,
        path: '/' + souplineURLArray[3] + '/' + souplineURLArray[4] + '/' + souplineURLArray[5]
    };
    // file extension = last dot-separated part of the file name
    var fileext = souplineURLArray[5].split('.')[souplineURLArray[5].split('.').length - 1];
    var request = http.get(options, function (res) {
        var imagedata = '';
        res.setEncoding('binary');
        res.on('data', function (chunk) {
            imagedata += chunk;
        });
        res.on('end', function () {
            var fullpath = path + '/' + metadata.filename + '_' + souplineURLArray[5].split('.')[0];
            fs.writeFile(fullpath + '.' + fileext, imagedata, 'binary', function (err) {
                if (err) throw err;
                // BUG FIX: the original tested `writeMeta = true` — an assignment
                // to an undeclared variable, always truthy — so the metadata file
                // was written unconditionally. Honor the metaData CLI flag.
                if (metaData) {
                    // write the metadata text file next to the image
                    fs.writeFile(fullpath + '.txt', metadata.stringedSoup, 'utf8', function (err) {
                        if (err) console.log(err.message);
                        console.log('File ' + fullpath + ' and Metadata saved.');
                        callback();
                    });
                } else {
                    console.log('File ' + fullpath + ' saved.');
                    callback();
                }
            });
        });
    });
    // Without this handler a network error crashes the process and the
    // queue slot is never released.
    request.on('error', function (err) {
        console.log('Download failed for ' + souplineOBJ.url + ': ' + err.message);
        callback();
    });
}
// Download queue: each task is one RSS line; at most parralelDLs images are
// downloaded concurrently. Coerce the value to a number — it originates from
// process.argv and async.queue expects numeric concurrency (fallback: 1).
var q = async.queue(function (task, callback) {
    downloader(task, callback);
}, Number(parralelDLs) || 1);
// Push a new line into the queue to be processed.
function processRssLine(line) {
    count++;
    q.push({ id: count.toString(), line: line }, function (err) {
        if (err) console.log("ERROR in Queue: ", err.message);
    });
}
// When the queue drains, resume the paused file stream so the next
// batch of lines can be read.
q.empty = function () {
    console.log("NEW BATCH");
    stream.resume();
};
// Stream the RSS file lazily, one line at a time. Every line pauses the
// stream so the download queue can catch up; q.empty resumes it once the
// current batch has been processed.
new lazy(stream).lines.forEach(function (rssLine) {
    stream.pause();
    processRssLine(rssLine.toString());
});