35 #include <BESCatalogList.h>
36 #include <BESCatalogUtils.h>
37 #include <CatalogItem.h>
39 #include "RemoteHttpResource.h"
40 #include "HttpdCatalogNames.h"
42 #include "HttpdDirScraper.h"
// Log-message prefix used by the BESDEBUG calls in this file. Expands to a
// std::string of the form "HttpdDirScraper::<function>() - ", where <function>
// is the enclosing function's __func__.
47 #define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
49 namespace httpd_catalog {
// Constructor: fills d_months with the twelve lower-case, three-letter month
// abbreviations ("jan" .. "dec"), each mapped to a zero-based index (0 .. 11).
// The zero-based value is later assigned straight into tm.tm_mon by
// parse_time_format_A(), which uses the same 0..11 convention.
// NOTE(review): the braces and any member-initializer list are on lines not
// visible in this listing.
51 HttpdDirScraper::HttpdDirScraper()
54 d_months.insert(pair<string, int>(
string(
"jan"), 0));
55 d_months.insert(pair<string, int>(
string(
"feb"), 1));
56 d_months.insert(pair<string, int>(
string(
"mar"), 2));
57 d_months.insert(pair<string, int>(
string(
"apr"), 3));
58 d_months.insert(pair<string, int>(
string(
"may"), 4));
59 d_months.insert(pair<string, int>(
string(
"jun"), 5));
60 d_months.insert(pair<string, int>(
string(
"jul"), 6));
61 d_months.insert(pair<string, int>(
string(
"aug"), 7));
62 d_months.insert(pair<string, int>(
string(
"sep"), 8));
63 d_months.insert(pair<string, int>(
string(
"oct"), 9));
64 d_months.insert(pair<string, int>(
string(
"nov"), 10));
65 d_months.insert(pair<string, int>(
string(
"dec"), 11));
// Converts a size string scraped from a directory listing into a long.
//
// @param size_str  Size text; its last character is inspected as a possible
//                  alphabetic scale suffix.
// @return presumably the scaled size; the return statement and the code that
//         derives `scale` from `scale_c` are on lines not visible here.
72 long HttpdDirScraper::get_size_val(
const string size_str)
const
// Last character of the input, treated as a candidate scale suffix.
// NOTE(review): dereferencing rbegin() on an empty string is undefined
// behaviour; confirm callers never pass "" or that a guard exists on a line
// not shown.
74 char scale_c = *size_str.rbegin();
97 BESDEBUG(MODULE, prolog <<
"scale: " << scale << endl);
// Strip the suffix character, when it is a letter, before numeric parsing.
99 string result = size_str;
100 if (isalpha(scale_c)) result = size_str.substr(0, size_str.length() - 1);
// atol() parses the leading integer digits; it yields 0 when no conversion
// can be performed.
102 long size = atol(result.c_str());
103 BESDEBUG(MODULE, prolog <<
"raw size: " << size << endl);
// NOTE(review): the step that applies `scale` to `size` is presumably on the
// lines between the two log statements, which are not visible here.
106 BESDEBUG(MODULE, prolog <<
"scaled size: " << size << endl);
// Debug helper: writes every field of a struct tm to the stream `ss`, one
// "name: value" pair per line, in declaration order (tm_sec .. tm_isdst).
//
// @param tms  The broken-down time to dump (taken by value).
// @return a string; presumably the accumulated contents of `ss`. The
//         declaration of `ss` and the return statement are not visible here.
113 string show_tm_struct(
const tm tms)
116 ss <<
"tm_sec: " << tms.tm_sec << endl;
117 ss <<
"tm_min: " << tms.tm_min << endl;
118 ss <<
"tm_hour: " << tms.tm_hour << endl;
119 ss <<
"tm_mday: " << tms.tm_mday << endl;
120 ss <<
"tm_mon: " << tms.tm_mon << endl;
121 ss <<
"tm_year: " << tms.tm_year << endl;
122 ss <<
"tm_wday: " << tms.tm_wday << endl;
123 ss <<
"tm_yday: " << tms.tm_yday << endl;
124 ss <<
"tm_isdst: " << tms.tm_isdst << endl;
// Takes a struct tm by non-const reference, so it can modify the caller's
// object in place.
// NOTE(review): the body is not visible in this listing; judging by the name
// it presumably resets the fields to zero -- confirm against the full file.
131 void zero_tm_struct(tm &tms)
// Converts a last-modified timestamp scraped from an httpd directory page
// into a string (per the function name, ISO-8601; the formatting code and the
// return statement are on lines not visible here).
//
// The input is split into tokens using the delimiter set "- :" (dash, space,
// colon). The second token then selects the parser: all-alphabetic means the
// month is a name (format A), otherwise the date is numeric (format B).
//
// @param httpd_time  Timestamp text as it appears in the listing.
145 string HttpdDirScraper::httpd_time_to_iso_8601(
const string httpd_time)
const
147 vector<string> tokens;
148 string delimiters =
"- :";
// NOTE(review): the call that fills `tokens` is on a line not shown.
151 BESDEBUG(MODULE, prolog <<
"Found " << tokens.size() <<
" tokens." << endl);
// Debug-only dump of every token with its index.
152 vector<string>::iterator it = tokens.begin();
155 while (it != tokens.end()) {
156 BESDEBUG(MODULE, prolog <<
" token["<< i++ <<
"]: "<< *it << endl);
// NOTE(review): tokens[1] is accessed with no size check visible in this
// listing; fewer than two tokens would make this out of bounds.
161 BESDEBUG(MODULE, prolog <<
"Second Field: "<< tokens[1] << endl);
// Scan the second token; is_alpha stays true only if every character is a
// letter. The loop stops at the first non-letter.
// NOTE(review): isalpha() receives a plain char here; a negative char value
// (non-ASCII byte) is undefined behaviour unless cast to unsigned char.
163 const char *second_field = tokens[1].c_str();
164 bool is_alpha =
true;
165 for(
unsigned long i=0; is_alpha && i< tokens[1].length(); i++){
166 is_alpha = isalpha(second_field[i]);
// The branch on is_alpha is not visible here; presumably format A is chosen
// when the second field is alphabetic and format B otherwise.
// NOTE(review): the log text below reads "YYY"; likely meant "YYYY".
170 BESDEBUG(MODULE, prolog <<
"Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
171 theTime = parse_time_format_A(tokens);
174 BESDEBUG(MODULE, prolog <<
"Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
175 theTime = parse_time_format_B(tokens);
// Parses a tokenized timestamp whose token order is
// day, month-name, year[, hour, minute] and converts it to a time_t.
//
// @param tokens  Timestamp tokens (vector is taken by value, i.e. copied).
// @return a time_t; presumably `theTime` from mktime(). The declaration of
//         `tm` and the return statement are on lines not visible here.
186 time_t HttpdDirScraper::parse_time_format_A(
const vector<string> tokens)
const
// The date part needs at least three tokens.
192 if (tokens.size() > 2) {
193 std::istringstream(tokens[0]) >> tm.tm_mday;
194 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
// `mnth` is a (name, index) pair obtained on a line not shown -- presumably a
// d_months lookup of tokens[1]. Its index is used directly as tm_mon.
197 BESDEBUG(MODULE, prolog <<
" mnth.first: "<< mnth.first << endl);
198 BESDEBUG(MODULE, prolog <<
" mnth.second: "<< mnth.second << endl);
199 tm.tm_mon = mnth.second;
200 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
// NOTE(review): struct tm counts years from 1900; any such adjustment would
// sit on the line between the read and the log, which is not visible here.
202 std::istringstream(tokens[2]) >> tm.tm_year;
204 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
// The time part is optional and needs five tokens in total.
206 if (tokens.size() > 4) {
207 std::istringstream(tokens[3]) >> tm.tm_hour;
208 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
209 std::istringstream(tokens[4]) >> tm.tm_min;
210 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
214 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() interprets the broken-down time as local time.
216 time_t theTime = mktime(&tm);
217 BESDEBUG(MODULE, prolog <<
"theTime: " << theTime << endl);
// Parses a tokenized, all-numeric timestamp whose token order is
// year, month, day[, hour, minute] and converts it to a time_t.
//
// @param tokens  Timestamp tokens (vector is taken by value, i.e. copied).
// @return a time_t; presumably `theTime` from mktime(). The declaration of
//         `tm` and the return statement are on lines not visible here.
226 time_t HttpdDirScraper::parse_time_format_B(
const vector<string> tokens)
const
// The date part needs at least three tokens.
232 if (tokens.size() > 2) {
// NOTE(review): struct tm counts years from 1900 and months from 0; any
// adjustment of the values read below would be on lines not visible here.
233 std::istringstream(tokens[0]) >> tm.tm_year;
235 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
237 std::istringstream(tokens[1]) >> tm.tm_mon;
238 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
240 std::istringstream(tokens[2]) >> tm.tm_mday;
241 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
// The time part is optional and needs five tokens in total.
243 if (tokens.size() > 4) {
244 std::istringstream(tokens[3]) >> tm.tm_hour;
245 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
246 std::istringstream(tokens[4]) >> tm.tm_min;
247 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
251 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() interprets the broken-down time as local time. The value logged
// below is the raw time_t, although the message labels it "ISO-8601 Time".
253 time_t theTime = mktime(&tm);
254 BESDEBUG(MODULE, prolog <<
"ISO-8601 Time: " << theTime << endl);
// Fetches the httpd-generated directory page at `url`, scans it for <a> links
// and adds one bes::CatalogItem per accepted link to `items`.
//
// For each "<a ...>...</a>" element the function extracts the link text and
// the href value, then reads the text of the next two "<td " elements as the
// last-modified time and the size. Links are skipped when they are empty,
// contain images or "<<<" / ">>>", point to a different http(s) location than
// `url`, or match one of the exclusion regexes. Remaining links are inserted
// into `items` as either a node or a leaf.
//
// @param url    Address of the directory page (taken by value).
// @param items  Output map, keyed by item name.
// NOTE(review): the map stores raw CatalogItem pointers and the allocations
// are on lines not shown; ownership presumably passes to the caller -- confirm.
// NOTE(review): the loop header, braces, several declarations and the
// node-versus-leaf condition are on lines not visible in this listing.
274 void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items)
const
// Retrieve the page and open the resource's cache file. `buffer` is filled on
// lines not shown, presumably from the stream `t`.
279 RemoteHttpResource rhr(url);
280 rhr.retrieveResource();
281 ifstream t(rhr.getCacheFileName().c_str());
284 string pageStr = buffer.str();
// Markers used to locate elements in the raw HTML text.
286 string aOpenStr =
"<a ";
287 string aCloseStr =
"</a>";
288 string hrefStr =
"href=\"";
289 string tdOpenStr =
"<td ";
290 string tdCloseStr =
"</td>";
// hrefs to ignore: fragment links, "?C..." links, anything containing
// "redirect/", a bare "/", and values that begin with an <img tag.
292 BESRegex hrefExcludeRegex(
"(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
// Link text to ignore: the "Parent Directory" entry.
293 BESRegex nameExcludeRegex(
"^Parent Directory$");
// Locate the next anchor element at or after next_start.
// NOTE(review): find() returns size_t and the result is stored in an int; the
// "< 0" tests rely on string::npos converting to a negative value.
298 int aOpenIndex = pageStr.find(aOpenStr, next_start);
299 if (aOpenIndex < 0) {
303 int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.length());
304 if (aCloseIndex < 0) {
311 BESDEBUG(MODULE, prolog <<
"aOpenIndex: " << aOpenIndex << endl);
312 BESDEBUG(MODULE, prolog <<
"aCloseIndex: " << aCloseIndex << endl);
// Whole element, from "<a " through the closing "</a>".
313 length = aCloseIndex + aCloseStr.length() - aOpenIndex;
314 string aElemStr = pageStr.substr(aOpenIndex, length);
315 BESDEBUG(MODULE, prolog <<
"Processing link: " << aElemStr << endl);
// Link text: everything between the first '>' and the following '<'.
318 int start = aElemStr.find(
">") + 1;
319 int end = aElemStr.find(
"<", start);
320 length = end - start;
321 string linkText = aElemStr.substr(start, length);
322 BESDEBUG(MODULE, prolog <<
"Link Text: " << linkText << endl);
// href value: text between href=" and the next double quote.
325 start = aElemStr.find(hrefStr) + hrefStr.length();
326 end = aElemStr.find(
"\"", start);
327 length = end - start;
328 string href = aElemStr.substr(start, length);
329 BESDEBUG(MODULE, prolog <<
"href: " << href << endl);
// The first "td" element after the link is read as the time string.
333 int start_pos = getNextElementText(pageStr,
"td", aCloseIndex + aCloseStr.length(), time_str);
334 BESDEBUG(MODULE, prolog <<
"time_str: '" << time_str <<
"'" << endl);
// The following "td" element is read as the size string.
338 start_pos = getNextElementText(pageStr,
"td", start_pos, size_str);
339 BESDEBUG(MODULE, prolog <<
"size_str: '" << size_str <<
"'" << endl);
// Skip links whose text is empty or contains an image or "<<<" / ">>>".
341 if ((linkText.find(
"<img") != string::npos) || !(linkText.length()) || (linkText.find(
"<<<") != string::npos)
342 || (linkText.find(
">>>") != string::npos)) {
343 BESDEBUG(MODULE, prolog <<
"SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
// Skip empty hrefs and absolute http(s) hrefs that do not start with `url`.
346 if (href.length() == 0 || (((href.find(
"http://") == 0) || (href.find(
"https://") == 0)) && !(href.find(url) == 0))) {
348 BESDEBUG(MODULE, prolog <<
"SKIPPING(null or remote): " << href << endl);
350 else if (hrefExcludeRegex.match(href.c_str(), href.length(), 0) > 0) {
352 BESDEBUG(MODULE, prolog <<
"SKIPPING(hrefExcludeRegex) - href: '" << href <<
"'"<< endl);
354 else if (nameExcludeRegex.match(linkText.c_str(), linkText.length(), 0) > 0) {
356 BESDEBUG(MODULE, prolog <<
"SKIPPING(nameExcludeRegex) - name: '" << linkText <<
"'" << endl);
// Node case: the name is the href minus its final character (presumably a
// trailing '/'; the condition that selects this branch is not visible).
359 string node_name = href.substr(0, href.length() - 1);
361 BESDEBUG(MODULE, prolog <<
"NODE: " << node_name << endl);
363 childNode->
set_type(CatalogItem::node);
366 string iso_8601_time = httpd_time_to_iso_8601(time_str);
367 childNode->
set_lmt(iso_8601_time);
369 long size = get_size_val(size_str);
372 items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
// Leaf case: the item is keyed by the unmodified href.
376 BESDEBUG(MODULE, prolog <<
"LEAF: " << href << endl);
378 leafItem->
set_type(CatalogItem::leaf);
381 string iso_8601_time = httpd_time_to_iso_8601(time_str);
382 leafItem->
set_lmt(iso_8601_time);
383 long size = get_size_val(size_str);
386 items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
// Continue scanning just past the current "</a>".
390 next_start = aCloseIndex + aCloseStr.length();
// Finds the next "<element_name " ... "</element_name>" element in page_str at
// or after startIndex and stores its text content in resultText.
//
// @param page_str      Page text to search.
// @param element_name  Tag name without angle brackets (e.g. "td").
// @param startIndex    Offset in page_str at which the search begins.
// @param resultText    Output: text between the element's first '>' and the
//                      next '<'.
// @param trim          Used on lines not visible here; presumably controls
//                      whitespace trimming of resultText -- confirm.
// @return startIndex plus the length of the matched element text.
// NOTE(review): the return value is relative to startIndex, not to where the
// element was found, so it only marks the element's end when the element
// begins exactly at startIndex -- verify this is intended.
407 int HttpdDirScraper::getNextElementText(
const string &page_str,
string element_name,
int startIndex,
string &resultText,
bool trim)
const
// The opening marker ends in a space, so only tags that carry attributes
// (e.g. "<td align=...>") match it.
409 string e_open_str =
"<" + element_name +
" ";
410 string e_close_str =
"</" + element_name +
">";
// Extract the whole element, from the opening marker through the closing tag.
// NOTE(review): no string::npos check is visible for either find() result.
413 int start = page_str.find(e_open_str, startIndex);
414 int end = page_str.find(e_close_str, start + e_open_str.length());
415 int length = end + e_close_str.length() - start;
416 string element_str = page_str.substr(start, length);
// Text content: from just after the first '>' up to the next '<'.
419 start = element_str.find(
">") + 1;
420 end = element_str.find(
"<", start);
421 length = end - start;
422 resultText = element_str.substr(start, length);
426 BESDEBUG(MODULE, prolog <<
"resultText: '" << resultText <<
"'" << endl);
427 return startIndex + element_str.length();
// Builds a bes::CatalogNode for the remote location `url`.
//
// Visible behaviour: the directory page is scraped into a name -> CatalogItem
// map via createHttpdDirectoryPageMap(); each item is then attached to `node`
// with add_node() or add_leaf() according to its type. A separate path splits
// `url` on '/' and uses the last part as a leaf name before calling set_leaf().
//
// @param url   Remote URL to process.
// @param path  Not referenced on any visible line.
// @return a bes::CatalogNode pointer; presumably `node`, whose creation and
//         return are on lines not visible here.
// NOTE(review): the condition separating the directory path from the
// single-leaf path is not visible in this listing.
437 bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
439 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
// Scrape the page into a map of catalog items.
444 map<string, bes::CatalogItem *> items;
445 createHttpdDirectoryPageMap(url, items);
447 BESDEBUG(MODULE, prolog <<
"Found " << items.size() <<
" items." << endl);
// Attach every scraped item to the node. The iterator's initialization and
// increment, and the assignment of `item`, are on lines not shown.
448 map<string, bes::CatalogItem *>::iterator it;
450 while (it != items.end()) {
452 BESDEBUG(MODULE, prolog <<
"Adding item: '" << item->
get_name() <<
"'"<< endl);
453 if (item->
get_type() == CatalogItem::node)
454 node->add_node(item);
456 node->add_leaf(item);
// Single-item path: the last '/'-separated part of the URL is the leaf name.
463 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
464 string leaf_name = url_parts.back();
477 node->set_leaf(item);
// Second definition of get_node() with the same signature as the one above.
// NOTE(review): two definitions with identical signatures cannot both be
// compiled; one of them is presumably disabled (preprocessor block or
// comment) on lines not visible in this listing -- confirm which is live.
//
// Visible behaviour: the directory page is scraped into two string sets
// (nodes and leaves) through a three-argument createHttpdDirectoryPageMap()
// overload that is not defined in the visible lines. A CatalogItem is then
// built for each entry and attached with add_node() / add_leaf(). A separate
// path splits `url` on '/' and uses the last part as a leaf name before
// calling set_leaf().
//
// @param url   Remote URL to process.
// @param path  Not referenced on any visible line.
// @return a bes::CatalogNode pointer; presumably `node`, whose creation and
//         return are on lines not visible here.
484 bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
486 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
491 set<string> pageNodes;
492 set<string> pageLeaves;
493 createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);
495 BESDEBUG(MODULE, prolog <<
"Found " << pageNodes.size() <<
" nodes." << endl);
496 BESDEBUG(MODULE, prolog <<
"Found " << pageLeaves.size() <<
" leaves." << endl);
498 set<string>::iterator it;
// Child nodes: a trailing "/" is removed from each name before use.
500 it = pageNodes.begin();
501 while (it != pageNodes.end()) {
502 string pageNode = *it;
503 if (
BESUtil::endsWith(pageNode,
"/")) pageNode = pageNode.substr(0, pageNode.length() - 1);
506 childNode->
set_type(CatalogItem::node);
517 node->add_node(childNode);
// Leaves.
521 it = pageLeaves.begin();
522 while (it != pageLeaves.end()) {
525 leafItem->
set_type(CatalogItem::leaf);
537 node->add_leaf(leafItem);
// Single-item path: the last '/'-separated part of the URL is the leaf name.
542 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
543 string leaf_name = url_parts.back();
554 node->set_leaf(item);