bes  Updated for version 3.20.6
cmr_module/RemoteHttpResource.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 
3 // This file is part of cmr_MODULE, A C++ MODULE that can be loaded in to
4 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5 
6 // Copyright (c) 2013 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
25 // Authors:
26 // ndp Nathan Potter <ndp@opendap.org>
27 
28 #include "config.h"
29 
30 #include <sstream>
31 #include <fstream>
32 #include <string>
33 #include <iostream>
34 
35 #include "BESInternalError.h"
36 
37 #include "BESDebug.h"
38 #include "BESUtil.h"
39 
40 #include "CmrNames.h"
41 #include "CmrCache.h"
42 #include "CmrUtils.h"
43 #include "curl_utils.h"
44 #include "RemoteHttpResource.h"
45 
46 using namespace std;
47 using namespace cmr;
48 
49 #define prolog std::string("RemoteHttpResource::").append(__func__).append("() - ")
50 
51 
57 RemoteHttpResource::RemoteHttpResource(const string &url) {
58  d_initialized = false;
59  d_fd = 0;
60  d_curl = 0;
61  d_resourceCacheFileName.clear();
62  d_response_headers = new vector<string>();
63  d_request_headers = new vector<string>();
64  d_http_response_headers = new map<string,string>();
65 
66  if (url.empty()) {
67  string err = "RemoteHttpResource(): Remote resource URL is empty";
68  throw BESInternalError(err, __FILE__, __LINE__);
69  }
70 
71  d_remoteResourceUrl = url;
72 
73  BESDEBUG(MODULE, prolog << "URL: " << d_remoteResourceUrl << endl);
74 
75  // EXAMPLE: returned value parameter for CURL *
76  //
77  // CURL *www_lib_init(CURL **curl); // function type signature
78  //
79  // CURL *pvparam = 0; // passed value parameter
80  // result = www_lib_init(&pvparam); // the call to the method
81 
82  d_curl = init(d_error_buffer); // This may throw either Error or InternalErr
83 
84  configureProxy(d_curl, d_remoteResourceUrl); // Configure the a proxy for this url (if appropriate).
85 
86  BESDEBUG(MODULE, prolog << "d_curl: " << d_curl << endl);
87 }
88 RemoteHttpResource::~RemoteHttpResource()
93 {
94  BESDEBUG(MODULE, prolog << "BEGIN resourceURL: " << d_remoteResourceUrl << endl);
95 
96  delete d_response_headers;
97  d_response_headers = 0;
98  BESDEBUG(MODULE, prolog << "Deleted d_response_headers." << endl);
99 
100  delete d_request_headers;
101  d_request_headers = 0;
102  BESDEBUG(MODULE, prolog << "Deleted d_request_headers." << endl);
103 
104  if (!d_resourceCacheFileName.empty()) {
105  CmrCache *cache = CmrCache::get_instance();
106  if (cache) {
107  cache->unlock_and_close(d_resourceCacheFileName);
108  BESDEBUG(MODULE, prolog << "Closed and unlocked "<< d_resourceCacheFileName << endl);
109  d_resourceCacheFileName.clear();
110  }
111  }
112 
113  if (d_curl) {
114  curl_easy_cleanup(d_curl);
115  BESDEBUG(MODULE, prolog << "Called curl_easy_cleanup()." << endl);
116  }
117  d_curl = 0;
118 
119  BESDEBUG(MODULE, prolog << "END resourceURL: " << d_remoteResourceUrl << endl);
120  d_remoteResourceUrl.clear();
121 }
122 
130 void RemoteHttpResource::retrieveResource()
131 {
132  BESDEBUG(MODULE, prolog << "BEGIN resourceURL: " << d_remoteResourceUrl << endl);
133 
134  if (d_initialized) {
135  BESDEBUG(MODULE, prolog << "END Already initialized." << endl);
136  return;
137  }
138 
139  // Get a pointer to the singleton cache instance for this process.
140  CmrCache *cache = CmrCache::get_instance();
141  if (!cache) {
142  ostringstream oss;
143  oss << __func__ << "() - FAILED to get local cache."
144  " Unable to proceed with request for " << this->d_remoteResourceUrl
145  << " The cmr_module MUST have a valid cache configuration to operate." << endl;
146  BESDEBUG(MODULE, oss.str());
147  throw BESInternalError(oss.str(), __FILE__, __LINE__);
148  }
149 
150  // Get the name of the file in the cache (either the code finds this file or
151  // or it makes it).
152  d_resourceCacheFileName = cache->get_cache_file_name(d_remoteResourceUrl);
153  BESDEBUG(MODULE, prolog << "d_resourceCacheFileName: " << d_resourceCacheFileName << endl);
154 
155  // @TODO MAKE THIS RETRIEVE THE CACHED DATA TYPE IF THE CACHED RESPONSE IF FOUND
156  // We need to know the type of the resource. HTTP headers are the preferred way to determine the type.
157  // Unfortunately, the current code losses both the HTTP headers sent from the request and the derived type
158  // to subsequent accesses of the cached object. Since we have to have a type, for now we just set the type
159  // from the url. If down below we DO an HTTP GET then the headers will be evaluated and the type set by setType()
160  // But really - we gotta fix this.
161  CmrUtils::Get_type_from_url(d_remoteResourceUrl, d_type);
162  BESDEBUG(MODULE, prolog << "d_type: " << d_type << endl);
163 
164  try {
165  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
166  BESDEBUG(MODULE, prolog << "Remote resource is already in cache. cache_file_name: " << d_resourceCacheFileName << endl);
167 
168  // #########################################################################################################
169  // I think in this if() is where we need to load the headers from the cache if we have them.
170  string hdr_filename = cache->get_cache_file_name(d_remoteResourceUrl) + ".hdrs";
171  std::ifstream hdr_ifs(hdr_filename.c_str());
172  try {
173  BESDEBUG(MODULE, prolog << "Reading response headers from: " << hdr_filename << endl);
174  for (std::string line; std::getline(hdr_ifs, line); ){
175  (*d_response_headers).push_back(line);
176  BESDEBUG(MODULE, prolog << "header: " << line << endl);
177  }
178  }
179  catch(...){
180  hdr_ifs.close();
181  throw;
182  }
183  ingest_http_headers_and_type();
184  d_initialized = true;
185  return;
186  // #########################################################################################################
187  }
188 
189  // Now we actually need to reach out across the interwebs and retrieve the remote resource and put it's
190  // content into a local cache file, given that it's not in the cache.
191  // First make an empty file and get an exclusive lock on it.
192  if (cache->create_and_lock(d_resourceCacheFileName, d_fd)) {
193 
194  // Write the remote resource to the cache file.
195  try {
196  writeResourceToFile(d_fd);
197  }
198  catch(...){
199  // If things went south then we need to dump the file because we'll end up with an empty/bogus file clogging the cache
200  unlink(d_resourceCacheFileName.c_str());
201  throw;
202  }
203 
204  // #########################################################################################################
205  // I think right here is where I would be able to cache the data type/response headers. While I have
206  // the exclusive lock I could open another cache file for metadata and write to it.
207  {
208  string hdr_filename = cache->get_cache_file_name(d_remoteResourceUrl) + ".hdrs";
209  std::ofstream hdr_out(hdr_filename.c_str());
210  try {
211  for(size_t i=0; i<this->d_response_headers->size() ;i++){
212  hdr_out << (*d_response_headers)[i] << endl;
213  }
214  }
215  catch (...) {
216  // If this fails for any reason we:
217  hdr_out.close(); // Close the stream
218  unlink(hdr_filename.c_str()); // unlink the file
219  unlink(d_resourceCacheFileName.c_str()); // unlink the primary cache file.
220  throw;
221  }
222  }
223  // #########################################################################################################
224 
225  // Change the exclusive lock on the new file to a shared lock. This keeps
226  // other processes from purging the new file and ensures that the reading
227  // process can use it.
228  cache->exclusive_to_shared_lock(d_fd);
229  BESDEBUG(MODULE, prolog << "Converted exclusive cache lock to shared lock." << endl);
230 
231  // Now update the total cache size info and purge if needed. The new file's
232  // name is passed into the purge method because this process cannot detect its
233  // own lock on the file.
234  unsigned long long size = cache->update_cache_info(d_resourceCacheFileName);
235  BESDEBUG(MODULE, prolog << "Updated cache info" << endl);
236 
237  if (cache->cache_too_big(size)) {
238  cache->update_and_purge(d_resourceCacheFileName);
239  BESDEBUG(MODULE, prolog << "Updated and purged cache." << endl);
240  }
241  BESDEBUG(MODULE, prolog << "END" << endl);
242  d_initialized = true;
243  return;
244  }
245  else {
246  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
247  BESDEBUG(MODULE, prolog << "Remote resource is in cache. cache_file_name: " << d_resourceCacheFileName << endl);
248  d_initialized = true;
249  return;
250  }
251  }
252 
253  string msg = prolog + "Failed to acquire cache read lock for remote resource: '";
254  msg += d_remoteResourceUrl + "\n";
255  throw libdap::Error(msg);
256 
257  }
258  catch (...) {
259  BESDEBUG(MODULE,
260  "RemoteHttpResource::retrieveResource() - Caught exception, unlocking cache and re-throw." << endl);
261  cache->unlock_cache();
262  throw;
263  }
264 
265 }
266 
275 void RemoteHttpResource::writeResourceToFile(int fd) {
276  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
277 
278  int status = -1;
279  try {
280  BESDEBUG(MODULE,
281  "RemoteHttpResource::writeResourceToFile() - Saving resource " << d_remoteResourceUrl << " to cache file " << d_resourceCacheFileName << endl);
282 
283  status = read_url(d_curl, d_remoteResourceUrl, fd, d_response_headers, d_request_headers, d_error_buffer); // Throws Error.
284 
285  if (status >= 400) {
286  BESDEBUG(MODULE, prolog << "HTTP returned an error status: " << status << endl);
287  // delete resp_hdrs; resp_hdrs = 0;
288  string msg = "Error while reading the URL: '";
289  msg += d_remoteResourceUrl;
290  msg += "'The HTTP request returned a status of " + libdap::long_to_string(status) + " which means '";
291  msg += http_status_to_string(status) + "' \n";
292  throw libdap::Error(msg);
293  }
294  BESDEBUG(MODULE, prolog << "Resource " << d_remoteResourceUrl << " saved to cache file " << d_resourceCacheFileName << endl);
295 
296  // rewind the file
297  // FIXME I think the idea here is that we have the file open and we should just keep
298  // reading from it. But the container mechanism works with file names, so we will
299  // likely have to open the file again. If that's true, lets remove this call. jhrg 3.2.18
300  int status = lseek(fd, 0, SEEK_SET);
301  if (-1 == status)
302  throw BESError("Could not seek within the response.", BES_NOT_FOUND_ERROR, __FILE__, __LINE__);
303  BESDEBUG(MODULE, prolog << "Reset file descriptor." << endl);
304 
305  ingest_http_headers_and_type();
306  }
307  catch (libdap::Error &e) {
308  throw;
309  }
310  BESDEBUG(MODULE, prolog << "END" << endl);
311 }
312 
313 
314 void RemoteHttpResource::ingest_http_headers_and_type(){
315  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
316 
317  const string colon_space = ": ";
318  for(size_t i=0; i<this->d_response_headers->size() ;i++){
319  size_t colon_index = (*d_response_headers)[i].find(colon_space);
320  string key = BESUtil::lowercase((*d_response_headers)[i].substr(0,colon_index));
321  string value = (*d_response_headers)[i].substr(colon_index + colon_space.length());
322  BESDEBUG(MODULE, prolog << "key: " << key << " value: " << value << endl);
323  (*d_http_response_headers)[key] = value;
324  }
325  string type;
326 
327  // Try and figure out the file type first from the
328  // Content-Disposition in the http header response.
329  string cdisp_hdr;
330  string ctype_hdr;
331  std::map<string,string>::iterator it;
332 
333  it = d_http_response_headers->find("content-disposition");
334  if(it != d_http_response_headers->end()){
335  cdisp_hdr = it->second;
336  }
337 
338  it = d_http_response_headers->find("content-type");
339  if(it != d_http_response_headers->end()){
340  ctype_hdr = it->second;
341  }
342 
343  if (!cdisp_hdr.empty()) {
344  // Content disposition exists, grab the filename
345  // attribute
346  CmrUtils::Get_type_from_disposition(cdisp_hdr, type);
347  BESDEBUG(MODULE,prolog << "Evaluated content-disposition '" << cdisp_hdr << "' matched type: \"" << type << "\"" << endl);
348  }
349 
350  // still haven't figured out the type. Check the content-type
351  // next, translate to the BES MODULE name. It's also possible
352  // that even though Content-disposition was available, we could
353  // not determine the type of the file.
354  if (type.empty() && !ctype_hdr.empty()) {
355  CmrUtils::Get_type_from_content_type(ctype_hdr, type);
356  BESDEBUG(MODULE,prolog << "Evaluated content-type '" << ctype_hdr << "' matched type \"" << type << "\"" << endl);
357  }
358 
359  // still haven't figured out the type. Now check the actual URL
360  // and see if we can't match the URL to a MODULE name
361  if (type.empty()) {
362  CmrUtils::Get_type_from_url(d_remoteResourceUrl, type);
363  BESDEBUG(MODULE,prolog << "Evaluated url '" << d_remoteResourceUrl << "' matched type: \"" << type << "\"" << endl);
364  }
365 
366  // still couldn't figure it out, punt
367  if (type.empty()) {
368  string err = prolog + "Unable to determine the type of data"
369  + " returned from '" + d_remoteResourceUrl + "' Setting type to 'unknown'";
370  BESDEBUG(MODULE, err << endl);
371  type = "unknown";
372  //throw BESSyntaxUserError( err, __FILE__, __LINE__ ) ;
373  }
374  d_type = type;
375  BESDEBUG(MODULE, prolog << "END (dataset type: "<< d_type << ")" << endl);
376 }
377 
383 std::string
384 RemoteHttpResource::get_http_response_header(const std::string header_name){
385  string value("");
386  std::map<string,string>::iterator it;
387  it = d_http_response_headers->find( BESUtil::lowercase(header_name));
388  if(it != d_http_response_headers->end())
389  value = it->second;
390  return value;
391 }
392 
BESFileLockingCache::get_read_lock
virtual bool get_read_lock(const std::string &target, int &fd)
Get a read-only lock on the file if it exists.
Definition: BESFileLockingCache.cc:544
BESFileLockingCache::create_and_lock
virtual bool create_and_lock(const std::string &target, int &fd)
Create a file in the cache and lock it for write access.
Definition: BESFileLockingCache.cc:599
BESFileLockingCache::unlock_and_close
virtual void unlock_and_close(const std::string &target)
Definition: BESFileLockingCache.cc:713
BESInternalError
exception thrown if internal error encountered
Definition: BESInternalError.h:43
BESFileLockingCache::update_cache_info
virtual unsigned long long update_cache_info(const std::string &target)
Update the cache info file to include 'target'.
Definition: BESFileLockingCache.cc:737
BESFileLockingCache::update_and_purge
virtual void update_and_purge(const std::string &new_file)
Purge files from the cache.
Definition: BESFileLockingCache.cc:940
BESFileLockingCache::unlock_cache
virtual void unlock_cache()
Definition: BESFileLockingCache.cc:686
BESUtil::lowercase
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:200
cmr::CmrCache
A cache for content accessed via the CMR.
Definition: CmrCache.h:56
cmr::CmrCache::get_cache_file_name
virtual std::string get_cache_file_name(const std::string &src, bool mangle=true)
Definition: CmrCache.cc:215
BESError
Abstract exception class for the BES with basic string message.
Definition: BESError.h:58
BESFileLockingCache::cache_too_big
virtual bool cache_too_big(unsigned long long current_size) const
look at the cache size; is it too large? Look at the cache size and see if it is too big.
Definition: BESFileLockingCache.cc:780
BESFileLockingCache::exclusive_to_shared_lock
virtual void exclusive_to_shared_lock(int fd)
Transfer from an exclusive lock to a shared lock.
Definition: BESFileLockingCache.cc:630