bes  Updated for version 3.20.6
httpd_catalog_module/RemoteHttpResource.cc
1 
2 // -*- mode: c++; c-basic-offset:4 -*-
3 
4 // This file is part of httpd_catalog_module, a C++ module that can be loaded into
5 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
6 
7 // Copyright (c) 2018 OPeNDAP, Inc.
8 // Author: Nathan Potter <ndp@opendap.org>
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 //
24 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
25 
26 // Authors:
27 // ndp Nathan Potter <ndp@opendap.org>
28 
29 #include "config.h"
30 
31 #include <sstream>
32 #include <fstream>
33 #include <string>
34 #include <iostream>
35 
36 #include <BESDebug.h>
37 #include <BESUtil.h>
38 #include <BESInternalError.h>
39 
40 #include "curl_utils.h"
41 #include "HttpdCatalogNames.h"
42 #include "HttpdCatalogUtils.h"
43 #include "RemoteHttpResource.h"
44 #include "RemoteHttpResourceCache.h"
45 
46 using namespace std;
47 
48 #define prolog string("RemoteHttpResource::").append(__func__).append("() - ")
49 
50 namespace httpd_catalog {
51 
57 RemoteHttpResource::RemoteHttpResource(const string &const_url)
58 {
59  d_initialized = false;
60  d_fd = 0;
61  d_curl = 0;
62  d_resourceCacheFileName.clear();
63  d_response_headers = new vector<string>();
64  d_request_headers = new vector<string>();
65  d_http_response_headers = new map<string, string>();
66 
67  BESDEBUG(MODULE, prolog << "Passed url: " << const_url << endl);
68 
69  string url = const_url;
70  if (url.empty()) {
71  string err = "RemoteHttpResource(): Remote resource URL is empty";
72  throw BESInternalError(err, __FILE__, __LINE__);
73  }
74 
75  size_t file_index = url.find("file://");
76  if( file_index!=url.npos && file_index==0 && *url.rbegin()=='/'){
77  url = url.substr(0,url.length()-1);
78  }
79 
80  d_remoteResourceUrl = url;
81 
82  BESDEBUG(MODULE, prolog << "URL: " << d_remoteResourceUrl << endl);
83 
84  // EXAMPLE: returned value parameter for CURL *
85  //
86  // CURL *www_lib_init(CURL **curl); // function type signature
87  //
88  // CURL *pvparam = 0; // passed value parameter
89  // result = www_lib_init(&pvparam); // the call to the method
90 
91  d_curl = init(d_error_buffer); // This may throw either Error or InternalErr
92 
93  configureProxy(d_curl, d_remoteResourceUrl); // Configure the a proxy for this url (if appropriate).
94 
95  BESDEBUG(MODULE, prolog << "d_curl: " << d_curl << endl);
96 }
97 RemoteHttpResource::~RemoteHttpResource()
102 {
103  BESDEBUG(MODULE, prolog << "BEGIN resourceURL: " << d_remoteResourceUrl << endl);
104 
105  delete d_response_headers;
106  d_response_headers = 0;
107  BESDEBUG(MODULE, prolog << "Deleted d_response_headers." << endl);
108 
109  delete d_request_headers;
110  d_request_headers = 0;
111  BESDEBUG(MODULE, prolog << "Deleted d_request_headers." << endl);
112 
113  if (!d_resourceCacheFileName.empty()) {
114  RemoteHttpResourceCache *cache = RemoteHttpResourceCache::get_instance();
115  if (cache) {
116  cache->unlock_and_close(d_resourceCacheFileName);
117  BESDEBUG(MODULE, prolog << "Closed and unlocked "<< d_resourceCacheFileName << endl);
118  d_resourceCacheFileName.clear();
119  }
120  }
121 
122  if (d_curl) {
123  curl_easy_cleanup(d_curl);
124  BESDEBUG(MODULE, prolog << "Called curl_easy_cleanup()." << endl);
125  }
126  d_curl = 0;
127 
128  BESDEBUG(MODULE, prolog << "END resourceURL: " << d_remoteResourceUrl << endl);
129  d_remoteResourceUrl.clear();
130 }
131 
139 void RemoteHttpResource::retrieveResource()
140 {
141  BESDEBUG(MODULE, prolog << "BEGIN resourceURL: " << d_remoteResourceUrl << endl);
142 
143  if (d_initialized) {
144  BESDEBUG(MODULE, prolog << "END Already initialized." << endl);
145  return;
146  }
147 
148  // Get a pointer to the singleton cache instance for this process.
149  RemoteHttpResourceCache *cache = RemoteHttpResourceCache::get_instance();
150  if (!cache) {
151  ostringstream oss;
152  oss << __func__ << "() - FAILED to get local cache."
153  " Unable to proceed with request for " << this->d_remoteResourceUrl << " The httpd_catalog MUST have a valid cache configuration to operate."
154  << endl;
155  BESDEBUG(MODULE, oss.str());
156  throw BESInternalError(oss.str(), __FILE__, __LINE__);
157  }
158 
159  // Get the name of the file in the cache (either the code finds this file or
160  // or it makes it).
161  d_resourceCacheFileName = cache->get_cache_file_name(d_remoteResourceUrl);
162  BESDEBUG(MODULE, prolog << "d_resourceCacheFileName: " << d_resourceCacheFileName << endl);
163 
164  // @TODO MAKE THIS RETRIEVE THE CACHED DATA TYPE IF THE CACHED RESPONSE IF FOUND
165  // We need to know the type of the resource. HTTP headers are the preferred way to determine the type.
166  // Unfortunately, the current code losses both the HTTP headers sent from the request and the derived type
167  // to subsequent accesses of the cached object. Since we have to have a type, for now we just set the type
168  // from the url. If down below we DO an HTTP GET then the headers will be evaluated and the type set by setType()
169  // But really - we gotta fix this.
170  HttpdCatalogUtils::get_type_from_url(d_remoteResourceUrl, d_type);
171  BESDEBUG(MODULE, prolog << "d_type: " << d_type << endl);
172 
173  try {
174  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
175  BESDEBUG(MODULE, prolog << "Remote resource is already in cache. cache_file_name: " << d_resourceCacheFileName << endl);
176 
177  // #########################################################################################################
178  // I think in this if() is where we need to load the headers from the cache if we have them.
179  string hdr_filename = cache->get_cache_file_name(d_remoteResourceUrl) + ".hdrs";
180  ifstream hdr_ifs(hdr_filename.c_str());
181  try {
182  BESDEBUG(MODULE, prolog << "Reading response headers from: " << hdr_filename << endl);
183  for (string line; getline(hdr_ifs, line);) {
184  (*d_response_headers).push_back(line);
185  BESDEBUG(MODULE, prolog << "header: " << line << endl);
186  }
187  }
188  catch (...) {
189  hdr_ifs.close();
190  throw;
191  }
192  ingest_http_headers_and_type();
193  d_initialized = true;
194  return;
195  // #########################################################################################################
196  }
197 
198  // Now we actually need to reach out across the interwebs and retrieve the remote resource and put it's
199  // content into a local cache file, given that it's not in the cache.
200  // First make an empty file and get an exclusive lock on it.
201  if (cache->create_and_lock(d_resourceCacheFileName, d_fd)) {
202 
203  // Write the remote resource to the cache file.
204  try {
205  writeResourceToFile(d_fd);
206  }
207  catch(...){
208  // If things went south then we need to dump the file because we'll end up with an empty/bogus file clogging the cache
209  unlink(d_resourceCacheFileName.c_str());
210  throw;
211  }
212 
213  // #########################################################################################################
214  // I think right here is where I would be able to cache the data type/response headers. While I have
215  // the exclusive lock I could open another cache file for metadata and write to it.
216  {
217  string hdr_filename = cache->get_cache_file_name(d_remoteResourceUrl) + ".hdrs";
218  ofstream hdr_out(hdr_filename.c_str());
219  try {
220  for (size_t i = 0; i < this->d_response_headers->size(); i++) {
221  hdr_out << (*d_response_headers)[i] << endl;
222  }
223  }
224  catch (...) {
225  // If this fails for any reason we:
226  hdr_out.close(); // Close the stream
227  unlink(hdr_filename.c_str()); // unlink the file
228  unlink(d_resourceCacheFileName.c_str()); // unlink the primary cache file.
229  throw;
230  }
231  }
232  // #########################################################################################################
233 
234  // Change the exclusive lock on the new file to a shared lock. This keeps
235  // other processes from purging the new file and ensures that the reading
236  // process can use it.
237  cache->exclusive_to_shared_lock(d_fd);
238  BESDEBUG(MODULE, prolog << "Converted exclusive cache lock to shared lock." << endl);
239 
240  // Now update the total cache size info and purge if needed. The new file's
241  // name is passed into the purge method because this process cannot detect its
242  // own lock on the file.
243  unsigned long long size = cache->update_cache_info(d_resourceCacheFileName);
244  BESDEBUG(MODULE, prolog << "Updated cache info" << endl);
245 
246  if (cache->cache_too_big(size)) {
247  cache->update_and_purge(d_resourceCacheFileName);
248  BESDEBUG(MODULE, prolog << "Updated and purged cache." << endl);
249  }
250  BESDEBUG(MODULE, prolog << "END" << endl);
251  d_initialized = true;
252  return;
253  }
254  else {
255  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
256  BESDEBUG(MODULE, prolog << "Remote resource is in cache. cache_file_name: " << d_resourceCacheFileName << endl);
257  d_initialized = true;
258  return;
259  }
260  }
261 
262  string msg = prolog + "Failed to acquire cache read lock for remote resource: '";
263  msg += d_remoteResourceUrl + "\n";
264  throw BESInternalError(msg, __FILE__, __LINE__);
265 #if 0
266  throw libdap::Error(msg);
267 #endif
268  }
269  catch (...) {
270  BESDEBUG(MODULE, "RemoteHttpResource::retrieveResource() - Caught exception, unlocking cache and re-throw." << endl);
271  cache->unlock_cache();
272  throw;
273  }
274 }
275 
284 void RemoteHttpResource::writeResourceToFile(int fd)
285 {
286  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
287 
288  int status = -1;
289 
290 #if 0
291  try {
292 #endif
293 
294  BESDEBUG(MODULE,
295  "RemoteHttpResource::writeResourceToFile() - Saving resource " << d_remoteResourceUrl << " to cache file " << d_resourceCacheFileName << endl);
296 
297  status = read_url(d_curl, d_remoteResourceUrl, fd, d_response_headers, d_request_headers, d_error_buffer); // Throws Error.
298 
299  if (status >= 400) {
300  BESDEBUG(MODULE, prolog << "HTTP returned an error status: " << status << endl);
301  ostringstream oss;
302  oss << "Error while reading the URL: '";
303  oss << d_remoteResourceUrl;
304  oss << "' The HTTP request returned a status of " << status << " which means '";
305  oss << http_status_to_string(status) << "' \n";
306  throw BESInternalError(oss.str(), __FILE__, __LINE__);
307  }
308 
309  BESDEBUG(MODULE, prolog << "Resource " << d_remoteResourceUrl << " saved to cache file " << d_resourceCacheFileName << endl);
310 
311  // rewind the file
312 
313  // FIXME I think the idea here is that we have the file open and we should just keep
314  // reading from it. But the container mechanism works with file names, so we will
315  // likely have to open the file again. If that's true, lets remove this call. jhrg 3.2.18
316 
317  status = lseek(fd, 0, SEEK_SET);
318  if (-1 == status) throw BESError("Could not seek within the response.", BES_NOT_FOUND_ERROR, __FILE__, __LINE__);
319 
320  BESDEBUG(MODULE, prolog << "Reset file descriptor." << endl);
321 
322  ingest_http_headers_and_type();
323 
324 #if 0
325 }
326 catch (libdap::Error &e) {
327  throw;
328 }
329 #endif
330 
331  BESDEBUG(MODULE, prolog << "END" << endl);
332 }
333 
334 void RemoteHttpResource::ingest_http_headers_and_type()
335 {
336  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
337 
338  const string colon_space = ": ";
339  for (size_t i = 0; i < this->d_response_headers->size(); i++) {
340  size_t colon_index = (*d_response_headers)[i].find(colon_space);
341  string key = BESUtil::lowercase((*d_response_headers)[i].substr(0, colon_index));
342  string value = (*d_response_headers)[i].substr(colon_index + colon_space.length());
343  BESDEBUG(MODULE, prolog << "key: " << key << " value: " << value << endl);
344  (*d_http_response_headers)[key] = value;
345  }
346 
347  // Try and figure out the file type first from the
348  // Content-Disposition in the http header response.
349  string cdisp_hdr;
350  string ctype_hdr;
351  map<string, string>::iterator it;
352 
353  it = d_http_response_headers->find("content-disposition");
354  if (it != d_http_response_headers->end()) {
355  cdisp_hdr = it->second;
356  }
357 
358  it = d_http_response_headers->find("content-type");
359  if (it != d_http_response_headers->end()) {
360  ctype_hdr = it->second;
361  }
362 
363  string type;
364 
365  if (!cdisp_hdr.empty()) {
366  // Content disposition exists, grab the filename
367  // attribute
368  HttpdCatalogUtils::get_type_from_disposition(cdisp_hdr, type);
369  BESDEBUG(MODULE, prolog << "Evaluated content-disposition '" << cdisp_hdr << "' matched type: \"" << type << "\"" << endl);
370  }
371 
372  // still haven't figured out the type. Check the content-type
373  // next, translate to the BES MODULE name. It's also possible
374  // that even though Content-disposition was available, we could
375  // not determine the type of the file.
376  if (type.empty() && !ctype_hdr.empty()) {
377  HttpdCatalogUtils::get_type_from_content_type(ctype_hdr, type);
378  BESDEBUG(MODULE, prolog << "Evaluated content-type '" << ctype_hdr << "' matched type \"" << type << "\"" << endl);
379  }
380 
381  // still haven't figured out the type. Now check the actual URL
382  // and see if we can't match the URL to a MODULE name
383  if (type.empty()) {
384  HttpdCatalogUtils::get_type_from_url(d_remoteResourceUrl, type);
385  BESDEBUG(MODULE, prolog << "Evaluated url '" << d_remoteResourceUrl << "' matched type: \"" << type << "\"" << endl);
386  }
387 
388  // still couldn't figure it out, punt
389  if (type.empty()) {
390  string err = prolog + "Unable to determine the type of data" + " returned from '" + d_remoteResourceUrl + "' Setting type to 'unknown'";
391  BESDEBUG(MODULE, err << endl);
392  type = "unknown";
393  //throw BESSyntaxUserError( err, __FILE__, __LINE__ ) ;
394  }
395 
396  d_type = type;
397 
398  BESDEBUG(MODULE, prolog << "END (dataset type: "<< d_type << ")" << endl);
399 }
400 
406 string RemoteHttpResource::get_http_response_header(const string header_name)
407 {
408  string value("");
409  map<string, string>::iterator it;
410  it = d_http_response_headers->find(BESUtil::lowercase(header_name));
411  if (it != d_http_response_headers->end()) value = it->second;
412  return value;
413 }
414 
415 }
BESFileLockingCache::get_read_lock
virtual bool get_read_lock(const std::string &target, int &fd)
Get a read-only lock on the file if it exists.
Definition: BESFileLockingCache.cc:544
BESFileLockingCache::create_and_lock
virtual bool create_and_lock(const std::string &target, int &fd)
Create a file in the cache and lock it for write access.
Definition: BESFileLockingCache.cc:599
httpd_catalog::RemoteHttpResourceCache
A cache for content accessed via HTTP.
Definition: RemoteHttpResourceCache.h:50
BESFileLockingCache::unlock_and_close
virtual void unlock_and_close(const std::string &target)
Definition: BESFileLockingCache.cc:713
BESInternalError
exception thrown if internal error encountered
Definition: BESInternalError.h:43
BESFileLockingCache::update_cache_info
virtual unsigned long long update_cache_info(const std::string &target)
Update the cache info file to include 'target'.
Definition: BESFileLockingCache.cc:737
BESFileLockingCache::update_and_purge
virtual void update_and_purge(const std::string &new_file)
Purge files from the cache.
Definition: BESFileLockingCache.cc:940
BESFileLockingCache::unlock_cache
virtual void unlock_cache()
Definition: BESFileLockingCache.cc:686
BESUtil::lowercase
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:200
httpd_catalog::RemoteHttpResourceCache::get_cache_file_name
virtual std::string get_cache_file_name(const std::string &src, bool mangle=true)
Definition: RemoteHttpResourceCache.cc:204
BESError
Abstract exception class for the BES with basic string message.
Definition: BESError.h:58
BESFileLockingCache::cache_too_big
virtual bool cache_too_big(unsigned long long current_size) const
Look at the cache size and see if it is too big.
Definition: BESFileLockingCache.cc:780
BESFileLockingCache::exclusive_to_shared_lock
virtual void exclusive_to_shared_lock(int fd)
Transfer from an exclusive lock to a shared lock.
Definition: BESFileLockingCache.cc:630