bes  Updated for version 3.20.6
HttpdDirScraper.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 //
3 // This file is part of httpd_catalog_module, A C++ module that can be loaded in to
4 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5 //
6 // Copyright (c) 2018 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <stdlib.h> /* atol */
29 #include <ctype.h> /* isalpha and isdigit */
30 #include <time.h> /* mktime */
31 
32 #include <BESDebug.h>
33 #include <BESUtil.h>
34 #include <BESRegex.h>
35 #include <BESCatalogList.h>
36 #include <BESCatalogUtils.h>
37 #include <CatalogItem.h>
38 
39 #include "RemoteHttpResource.h"
40 #include "HttpdCatalogNames.h"
41 
42 #include "HttpdDirScraper.h"
43 
44 using namespace std;
45 using bes::CatalogItem;
46 
47 #define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
48 
49 namespace httpd_catalog {
50 
51 HttpdDirScraper::HttpdDirScraper()
52 {
53  // There was probably a better way to make this association but this worked.
54  d_months.insert(pair<string, int>(string("jan"), 0));
55  d_months.insert(pair<string, int>(string("feb"), 1));
56  d_months.insert(pair<string, int>(string("mar"), 2));
57  d_months.insert(pair<string, int>(string("apr"), 3));
58  d_months.insert(pair<string, int>(string("may"), 4));
59  d_months.insert(pair<string, int>(string("jun"), 5));
60  d_months.insert(pair<string, int>(string("jul"), 6));
61  d_months.insert(pair<string, int>(string("aug"), 7));
62  d_months.insert(pair<string, int>(string("sep"), 8));
63  d_months.insert(pair<string, int>(string("oct"), 9));
64  d_months.insert(pair<string, int>(string("nov"), 10));
65  d_months.insert(pair<string, int>(string("dec"), 11));
66 }
67 
68 /*
69  * @brief Converts an Apache httpd directory page "size" string (23K, 45M, 32G, etc)
70  * to an actual value, approximate though it may be.
71  */
72 long HttpdDirScraper::get_size_val(const string size_str) const
73 {
74  char scale_c = *size_str.rbegin();
75  long scale = 1;
76 
77  switch (scale_c) {
78  case 'K':
79  scale = 1e3;
80  break;
81  case 'M':
82  scale = 1e6;
83  break;
84  case 'G':
85  scale = 1e9;
86  break;
87  case 'T':
88  scale = 1e12;
89  break;
90  case 'P':
91  scale = 1e15;
92  break;
93  default:
94  scale = 1;
95  break;
96  }
97  BESDEBUG(MODULE, prolog << "scale: " << scale << endl);
98 
99  string result = size_str;
100  if (isalpha(scale_c)) result = size_str.substr(0, size_str.length() - 1);
101 
102  long size = atol(result.c_str());
103  BESDEBUG(MODULE, prolog << "raw size: " << size << endl);
104 
105  size *= scale;
106  BESDEBUG(MODULE, prolog << "scaled size: " << size << endl);
107  return size;
108 }
109 
/**
 * @brief Renders the nine calendar fields of a tm struct as text.
 *
 * Each field is written as "<field name>: <value>" on its own line, in the
 * order sec, min, hour, mday, mon, year, wday, yday, isdst.
 *
 * @param tms The broken-down time to describe.
 * @return The multi-line description; every line ends with a newline.
 */
std::string show_tm_struct(const tm tms)
{
    std::ostringstream description;
    description << "tm_sec: " << tms.tm_sec << '\n'
                << "tm_min: " << tms.tm_min << '\n'
                << "tm_hour: " << tms.tm_hour << '\n'
                << "tm_mday: " << tms.tm_mday << '\n'
                << "tm_mon: " << tms.tm_mon << '\n'
                << "tm_year: " << tms.tm_year << '\n'
                << "tm_wday: " << tms.tm_wday << '\n'
                << "tm_yday: " << tms.tm_yday << '\n'
                << "tm_isdst: " << tms.tm_isdst << '\n';
    return description.str();
}
127 
/**
 * @brief Resets the nine calendar fields of a tm struct to a neutral baseline.
 *
 * All fields become 0 except tm_mday, which becomes 1 because the day of the
 * month is 1-based. Any additional, platform-specific members of tm are left
 * untouched.
 *
 * @param tms The struct to reset in place.
 */
void zero_tm_struct(tm &tms)
{
    // Time of day.
    tms.tm_hour = tms.tm_min = tms.tm_sec = 0;

    // Calendar position; the day of the month starts counting at 1.
    tms.tm_year = tms.tm_mon = 0;
    tms.tm_mday = 1;

    // Derived fields and the daylight-saving flag.
    tms.tm_yday = tms.tm_wday = 0;
    tms.tm_isdst = 0;
}
143 
144 
145 string HttpdDirScraper::httpd_time_to_iso_8601(const string httpd_time) const
146 {
147  vector<string> tokens;
148  string delimiters = "- :";
149  BESUtil::tokenize(httpd_time, tokens, delimiters);
150 
151  BESDEBUG(MODULE, prolog << "Found " << tokens.size() << " tokens." << endl);
152  vector<string>::iterator it = tokens.begin();
153  int i = 0;
154  if (BESDebug::IsSet(MODULE)) {
155  while (it != tokens.end()) {
156  BESDEBUG(MODULE, prolog << " token["<< i++ << "]: "<< *it << endl);
157  it++;
158  }
159  }
160 
161  BESDEBUG(MODULE, prolog << "Second Field: "<< tokens[1] << endl);
162 
163  const char *second_field = tokens[1].c_str();
164  bool is_alpha = true;
165  for(unsigned long i=0; is_alpha && i< tokens[1].length(); i++){
166  is_alpha = isalpha(second_field[i]);
167  }
168  time_t theTime;
169  if(is_alpha){
170  BESDEBUG(MODULE, prolog << "Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
171  theTime = parse_time_format_A(tokens);
172  }
173  else {
174  BESDEBUG(MODULE, prolog << "Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
175  theTime = parse_time_format_B(tokens);
176  }
177  return BESUtil::get_time(theTime, false);
178 
179 }
180 
186 time_t HttpdDirScraper::parse_time_format_A(const vector<string> tokens) const
187 {
188  // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
189  struct tm tm;
190  zero_tm_struct(tm);
191 
192  if (tokens.size() > 2) {
193  std::istringstream(tokens[0]) >> tm.tm_mday;
194  BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
195 
196  pair<string, int> mnth = *d_months.find(BESUtil::lowercase(tokens[1]));
197  BESDEBUG(MODULE, prolog << " mnth.first: "<< mnth.first << endl);
198  BESDEBUG(MODULE, prolog << " mnth.second: "<< mnth.second << endl);
199  tm.tm_mon = mnth.second;
200  BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
201 
202  std::istringstream(tokens[2]) >> tm.tm_year;
203  tm.tm_year -= 1900;
204  BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
205 
206  if (tokens.size() > 4) {
207  std::istringstream(tokens[3]) >> tm.tm_hour;
208  BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
209  std::istringstream(tokens[4]) >> tm.tm_min;
210  BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
211  }
212  }
213 
214  BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
215 
216  time_t theTime = mktime(&tm);
217  BESDEBUG(MODULE, prolog << "theTime: " << theTime << endl);
218  return theTime;
219 }
220 
226 time_t HttpdDirScraper::parse_time_format_B(const vector<string> tokens) const
227 {
228  // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
229  struct tm tm;
230  zero_tm_struct(tm);
231 
232  if (tokens.size() > 2) {
233  std::istringstream(tokens[0]) >> tm.tm_year;
234  tm.tm_year -= 1900;
235  BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
236 
237  std::istringstream(tokens[1]) >> tm.tm_mon;
238  BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
239 
240  std::istringstream(tokens[2]) >> tm.tm_mday;
241  BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
242 
243  if (tokens.size() > 4) {
244  std::istringstream(tokens[3]) >> tm.tm_hour;
245  BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
246  std::istringstream(tokens[4]) >> tm.tm_min;
247  BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
248  }
249  }
250 
251  BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
252 
253  time_t theTime = mktime(&tm);
254  BESDEBUG(MODULE, prolog << "ISO-8601 Time: " << theTime << endl);
255  return theTime;
256 }
257 
274 void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items) const
275 {
276  const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
277 
278  // Go get the text from the remote resource
279  RemoteHttpResource rhr(url);
280  rhr.retrieveResource();
281  ifstream t(rhr.getCacheFileName().c_str());
282  stringstream buffer;
283  buffer << t.rdbuf();
284  string pageStr = buffer.str();
285 
286  string aOpenStr = "<a ";
287  string aCloseStr = "</a>";
288  string hrefStr = "href=\"";
289  string tdOpenStr = "<td ";
290  string tdCloseStr = "</td>";
291 
292  BESRegex hrefExcludeRegex("(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
293  BESRegex nameExcludeRegex("^Parent Directory$");
294 
295  bool done = false;
296  int next_start = 0;
297  while (!done) {
298  int aOpenIndex = pageStr.find(aOpenStr, next_start);
299  if (aOpenIndex < 0) {
300  done = true;
301  }
302  else {
303  int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.length());
304  if (aCloseIndex < 0) {
305  done = true;
306  }
307  else {
308  int length;
309 
310  // Locate the entire <a /> element
311  BESDEBUG(MODULE, prolog << "aOpenIndex: " << aOpenIndex << endl);
312  BESDEBUG(MODULE, prolog << "aCloseIndex: " << aCloseIndex << endl);
313  length = aCloseIndex + aCloseStr.length() - aOpenIndex;
314  string aElemStr = pageStr.substr(aOpenIndex, length);
315  BESDEBUG(MODULE, prolog << "Processing link: " << aElemStr << endl);
316 
317  // Find the link text
318  int start = aElemStr.find(">") + 1;
319  int end = aElemStr.find("<", start);
320  length = end - start;
321  string linkText = aElemStr.substr(start, length);
322  BESDEBUG(MODULE, prolog << "Link Text: " << linkText << endl);
323 
324  // Locate the href attribute
325  start = aElemStr.find(hrefStr) + hrefStr.length();
326  end = aElemStr.find("\"", start);
327  length = end - start;
328  string href = aElemStr.substr(start, length);
329  BESDEBUG(MODULE, prolog << "href: " << href << endl);
330 
331  // attempt to get time string
332  string time_str;
333  int start_pos = getNextElementText(pageStr, "td", aCloseIndex + aCloseStr.length(), time_str);
334  BESDEBUG(MODULE, prolog << "time_str: '" << time_str << "'" << endl);
335 
336  // attempt to get size string
337  string size_str;
338  start_pos = getNextElementText(pageStr, "td", start_pos, size_str);
339  BESDEBUG(MODULE, prolog << "size_str: '" << size_str << "'" << endl);
340 
341  if ((linkText.find("<img") != string::npos) || !(linkText.length()) || (linkText.find("<<<") != string::npos)
342  || (linkText.find(">>>") != string::npos)) {
343  BESDEBUG(MODULE, prolog << "SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
344  }
345  else {
346  if (href.length() == 0 || (((href.find("http://") == 0) || (href.find("https://") == 0)) && !(href.find(url) == 0))) {
347  // SKIPPING
348  BESDEBUG(MODULE, prolog << "SKIPPING(null or remote): " << href << endl);
349  }
350  else if (hrefExcludeRegex.match(href.c_str(), href.length(), 0) > 0) {
351  // SKIPPING
352  BESDEBUG(MODULE, prolog << "SKIPPING(hrefExcludeRegex) - href: '" << href << "'"<< endl);
353  }
354  else if (nameExcludeRegex.match(linkText.c_str(), linkText.length(), 0) > 0) {
355  // SKIPPING
356  BESDEBUG(MODULE, prolog << "SKIPPING(nameExcludeRegex) - name: '" << linkText << "'" << endl);
357  }
358  else if (BESUtil::endsWith(href, "/")) {
359  string node_name = href.substr(0, href.length() - 1);
360  // it's a directory aka a node
361  BESDEBUG(MODULE, prolog << "NODE: " << node_name << endl);
362  bes::CatalogItem *childNode = new bes::CatalogItem();
363  childNode->set_type(CatalogItem::node);
364  childNode->set_name(node_name);
365  childNode->set_is_data(false);
366  string iso_8601_time = httpd_time_to_iso_8601(time_str);
367  childNode->set_lmt(iso_8601_time);
368  // FIXME: For nodes the size should be the number of children, but how without crawling?
369  long size = get_size_val(size_str);
370  childNode->set_size(size);
371 
372  items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
373  }
374  else {
375  // It's a file aka a leaf
376  BESDEBUG(MODULE, prolog << "LEAF: " << href << endl);
377  CatalogItem *leafItem = new CatalogItem();
378  leafItem->set_type(CatalogItem::leaf);
379  leafItem->set_name(href);
380  leafItem->set_is_data(cat_utils->is_data(href));
381  string iso_8601_time = httpd_time_to_iso_8601(time_str);
382  leafItem->set_lmt(iso_8601_time);
383  long size = get_size_val(size_str);
384  leafItem->set_size(size);
385 
386  items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
387  }
388  }
389  }
390  next_start = aCloseIndex + aCloseStr.length();
391  }
392  }
393 }
394 
407 int HttpdDirScraper::getNextElementText(const string &page_str, string element_name, int startIndex, string &resultText, bool trim) const
408 {
409  string e_open_str = "<" + element_name + " ";
410  string e_close_str = "</" + element_name + ">";
411 
412  // Locate the next "element_name" element
413  int start = page_str.find(e_open_str, startIndex);
414  int end = page_str.find(e_close_str, start + e_open_str.length());
415  int length = end + e_close_str.length() - start;
416  string element_str = page_str.substr(start, length);
417 
418  // Find the text
419  start = element_str.find(">") + 1;
420  end = element_str.find("<", start);
421  length = end - start;
422  resultText = element_str.substr(start, length);
423 
424  if (trim) BESUtil::removeLeadingAndTrailingBlanks(resultText);
425 
426  BESDEBUG(MODULE, prolog << "resultText: '" << resultText << "'" << endl);
427  return startIndex + element_str.length();
428 }
429 
430 /*
431  * @brief Returns the catalog node represented by the httpd directory page returned
432  * by dereferencing the passed url.
433  * @param url The url of the Apache httpd directory to process.
434  * @param path The path prefix that associates the location of this generated CatalogNode with it's
435  * correct position in the local service path.
436  */
437 bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
438 {
439  BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
440  bes::CatalogNode *node = new bes::CatalogNode(path);
441 
442  if (BESUtil::endsWith(url, "/")) {
443  // This always means the URL points to a node when coming from httpd
444  map<string, bes::CatalogItem *> items;
445  createHttpdDirectoryPageMap(url, items);
446 
447  BESDEBUG(MODULE, prolog << "Found " << items.size() << " items." << endl);
448  map<string, bes::CatalogItem *>::iterator it;
449  it = items.begin();
450  while (it != items.end()) {
451  bes::CatalogItem *item = it->second;
452  BESDEBUG(MODULE, prolog << "Adding item: '" << item->get_name() << "'"<< endl);
453  if (item->get_type() == CatalogItem::node)
454  node->add_node(item);
455  else
456  node->add_leaf(item);
457  it++;
458  }
459  }
460  else {
461  // It's a leaf aka "item" response.
462  const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
463  std::vector<std::string> url_parts = BESUtil::split(url, '/', true);
464  string leaf_name = url_parts.back();
465 
466  CatalogItem *item = new CatalogItem();
467  item->set_type(CatalogItem::leaf);
468  item->set_name(leaf_name);
469  item->set_is_data(cat_utils->is_data(leaf_name));
470 
471  // FIXME: Find the Last Modified date? Head??
472  item->set_lmt(BESUtil::get_time(true));
473 
474  // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
475  item->set_size(1);
476 
477  node->set_leaf(item);
478  }
479  return node;
480 }
481 
#if 0

// Disabled earlier version of get_node(), kept for reference only. It is never
// compiled and relies on a three-argument createHttpdDirectoryPageMap() that
// fills separate sets of node and leaf names.
bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
{
    BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
    bes::CatalogNode *node = new bes::CatalogNode(path);

    if (!BESUtil::endsWith(url, "/")) {
        // Not a directory URL: the node wraps a single leaf named after the last path segment.
        std::vector<std::string> url_parts = BESUtil::split(url,'/',true);
        string leaf_name = url_parts.back();

        CatalogItem *item = new CatalogItem();
        item->set_type(CatalogItem::leaf);
        item->set_name(leaf_name);
        // FIXME: Find the Last Modified date?
        item->set_lmt(BESUtil::get_time(true));

        // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
        item->set_size(1);

        node->set_leaf(item);
        return node;
    }

    set<string> pageNodes;
    set<string> pageLeaves;
    createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);

    BESDEBUG(MODULE, prolog << "Found " << pageNodes.size() << " nodes." << endl);
    BESDEBUG(MODULE, prolog << "Found " << pageLeaves.size() << " leaves." << endl);

    // Child directories first.
    for (set<string>::iterator n = pageNodes.begin(); n != pageNodes.end(); n++) {
        string pageNode = *n;
        if (BESUtil::endsWith(pageNode, "/")) pageNode = pageNode.substr(0, pageNode.length() - 1);

        bes::CatalogItem *childNode = new bes::CatalogItem();
        childNode->set_type(CatalogItem::node);

        childNode->set_name(pageNode);
        childNode->set_is_data(false);

        // FIXME: Figure out the LMT if we can... HEAD?
        childNode->set_lmt(BESUtil::get_time(true));

        // FIXME: For nodes the size should be the number of children, but how without crawling?
        childNode->set_size(0);

        node->add_node(childNode);
    }

    // Then the files.
    for (set<string>::iterator l = pageLeaves.begin(); l != pageLeaves.end(); l++) {
        string leaf = *l;
        CatalogItem *leafItem = new CatalogItem();
        leafItem->set_type(CatalogItem::leaf);
        leafItem->set_name(leaf);

        // FIXME: wrangle up the Typematch and see if we think this thing is data or not.
        leafItem->set_is_data(false);

        // FIXME: Find the Last Modified date?
        leafItem->set_lmt(BESUtil::get_time(true));

        // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
        leafItem->set_size(1);

        node->add_leaf(leafItem);
    }

    return node;
}
#endif
561 
562 }
563  // namespace httpd_catalog
564 
BESCatalogUtils::is_data
bool is_data(const std::string &item) const
is there a handler that can process this
Definition: BESCatalogUtils.cc:447
BESUtil::tokenize
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition: BESUtil.cc:1057
bes::CatalogItem::set_size
void set_size(size_t s)
Set the size of the item.
Definition: CatalogItem.h:140
BESCatalogUtils
Definition: BESCatalogUtils.h:61
BESUtil::get_time
static std::string get_time(bool use_local_time=false)
Definition: BESUtil.cc:1079
bes::CatalogItem::get_type
item_type get_type() const
Get the type of this item (unknown, node or leaf)
Definition: CatalogItem.h:153
BESCatalogList::TheCatalogList
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
Definition: BESCatalogList.cc:81
bes::CatalogItem::set_name
void set_name(std::string n)
Set the name of the item.
Definition: CatalogItem.h:135
BESDebug::IsSet
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:157
bes::CatalogItem::set_is_data
void set_is_data(bool id)
Is this item data that the BES should interpret?
Definition: CatalogItem.h:150
bes::CatalogNode
Definition: CatalogNode.h:45
bes::CatalogItem
Definition: CatalogItem.h:72
BESUtil::endsWith
static bool endsWith(std::string const &fullString, std::string const &ending)
Definition: BESUtil.cc:942
BESCatalog::get_catalog_utils
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
Definition: BESCatalog.h:113
bes::CatalogItem::get_name
std::string get_name() const
The name of this item in the node.
Definition: CatalogItem.h:133
bes::CatalogItem::set_type
void set_type(item_type t)
Set the type for this item.
Definition: CatalogItem.h:155
bes::CatalogItem::set_lmt
void set_lmt(std::string lmt)
Set the LMT for this item.
Definition: CatalogItem.h:145
BESUtil::split
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the return vector of tokens using the delimiter delim and skipping empty val...
Definition: BESUtil.cc:1125
BESUtil::lowercase
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:200
BESRegex
Definition: BESRegex.h:41
BESUtil::removeLeadingAndTrailingBlanks
static void removeLeadingAndTrailingBlanks(std::string &key)
Definition: BESUtil.cc:466