bes  Updated for version 3.20.6
gateway_module/RemoteHttpResource.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 
3 // This file is part of gateway_module, A C++ module that can be loaded in to
4 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5 
6 // Copyright (c) 2013 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
25 // Authors:
26 // ndp Nathan Potter <ndp@opendap.org>
27 
28 #include "config.h"
29 
30 #include <sstream>
31 
32 #include "BESInternalError.h"
33 
34 #include "BESDebug.h"
35 #include "BESUtil.h"
36 
37 #include "GatewayCache.h"
38 #include "GatewayUtils.h"
39 #include "curl_utils.h"
40 #include "RemoteHttpResource.h"
41 
42 using namespace std;
43 using namespace gateway;
44 
50 RemoteHttpResource::RemoteHttpResource(const string &url)
51 {
52  d_initialized = false;
53 
54  d_fd = 0;
55  d_curl = 0;
56  d_resourceCacheFileName.clear();
57  d_response_headers = new vector<string>();
58  d_request_headers = new vector<string>();
59 
60  if (url.empty()) {
61  string err = "RemoteHttpResource(): Remote resource URL is empty";
62  throw BESInternalError(err, __FILE__, __LINE__);
63  }
64 
65  d_remoteResourceUrl = url;
66 
67  BESDEBUG("gateway", "RemoteHttpResource() - URL: " << d_remoteResourceUrl << endl);
68 
69  // EXAMPLE: returned value parameter for CURL *
70  //
71  // CURL *www_lib_init(CURL **curl); // function type signature
72  //
73  // CURL *pvparam = 0; // passed value parameter
74  // result = www_lib_init(&pvparam); // the call to the method
75 
76  d_curl = init(d_error_buffer); // This may throw either Error or InternalErr
77 
78  configureProxy(d_curl, d_remoteResourceUrl); // Configure the a proxy for this url (if appropriate).
79 
80  BESDEBUG("gateway", "RemoteHttpResource() - d_curl: " << d_curl << endl);
81 }
82 RemoteHttpResource::~RemoteHttpResource()
87 {
88  BESDEBUG("gateway", "~RemoteHttpResource() - BEGIN resourceURL: " << d_remoteResourceUrl << endl);
89 
90  delete d_response_headers;
91  d_response_headers = 0;
92  BESDEBUG("gateway", "~RemoteHttpResource() - Deleted d_response_headers." << endl);
93 
94  delete d_request_headers;
95  d_request_headers = 0;
96  BESDEBUG("gateway", "~RemoteHttpResource() - Deleted d_request_headers." << endl);
97 
98  if (!d_resourceCacheFileName.empty()) {
99  GatewayCache *cache = GatewayCache::get_instance();
100  if (cache) {
101  cache->unlock_and_close(d_resourceCacheFileName);
102  BESDEBUG("gateway", "~RemoteHttpResource() - Closed and unlocked "<< d_resourceCacheFileName << endl);
103  d_resourceCacheFileName.clear();
104  }
105  }
106 
107  if (d_curl) {
108  curl_easy_cleanup(d_curl);
109  BESDEBUG("gateway", "~RemoteHttpResource() - Called curl_easy_cleanup()." << endl);
110  }
111  d_curl = 0;
112 
113  BESDEBUG("gateway", "~RemoteHttpResource() - END resourceURL: " << d_remoteResourceUrl << endl);
114  d_remoteResourceUrl.clear();
115 }
116 
124 void RemoteHttpResource::retrieveResource()
125 {
126  BESDEBUG("gateway",
127  "RemoteHttpResource::retrieveResource() - BEGIN resourceURL: " << d_remoteResourceUrl << endl);
128 
129  if (d_initialized) {
130  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - END Already initialized." << endl);
131  return;
132  }
133 
134  // Get a pointer to the singleton cache instance for this process.
135  GatewayCache *cache = GatewayCache::get_instance();
136  if (!cache) {
137  ostringstream oss;
138  oss << __func__ << "() - FAILED to get local cache."
139  " Unable to proceed with request for " << this->d_remoteResourceUrl
140  << " The gateway_module MUST have a valid cache configuration to operate." << endl;
141  BESDEBUG("gateway", oss.str());
142  throw BESInternalError(oss.str(), __FILE__, __LINE__);
143  }
144 
145  // Get the name of the file in the cache (either the code finds this file or
146  // or it makes it).
147  d_resourceCacheFileName = cache->get_cache_file_name(d_remoteResourceUrl);
148  BESDEBUG("gateway",
149  "RemoteHttpResource::retrieveResource() - d_resourceCacheFileName: " << d_resourceCacheFileName << endl);
150 
151  // @FIXME MAKE THIS RETRIEVE THE CACHED DATA TYPE IF THE CACHED RESPONSE IF FOUND
152  // We need to know the type of the resource. HTTP headers are the preferred way to determine the type.
153  // Unfortunately, the current code losses both the HTTP headers sent from the request and the derived type
154  // to subsequent accesses of the cached object. Since we have to have a type, for now we just set the type
155  // from the url. If down below we DO an HTTP GET then the headers will be evaluated and the type set by setType()
156  // But really - we gotta fix this.
157  GatewayUtils::Get_type_from_url(d_remoteResourceUrl, d_type);
158  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - d_type: " << d_type << endl);
159 
160  try {
161 
162  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
163  BESDEBUG("gateway",
164  "RemoteHttpResource::retrieveResource() - Remote resource is already in cache. cache_file_name: " << d_resourceCacheFileName << endl);
165  d_initialized = true;
166  return;
167  }
168 
169  // Now we actually need to reach out across the interwebs and retrieve the remote resource and put it's
170  // content into a local cache file, given that it's not in the cache.
171  // First make an empty file and get an exclusive lock on it.
172  if (cache->create_and_lock(d_resourceCacheFileName, d_fd)) {
173 
174  // Write the remote resource to the cache file.
175  try {
176  writeResourceToFile(d_fd);
177  }
178  catch(...){
179  // If things went south then we need to dump the file because we'll end up with an empty/bogus file clogging the cache
180  unlink(d_resourceCacheFileName.c_str());
181  throw;
182  }
183 
184  // #########################################################################################################
185  // I think right here is where I would be able to cache the data type/response headers. While I have
186  // the exclusive lock I could open another cache file for metadata and write to it.
187  // #########################################################################################################
188 
189  // Change the exclusive lock on the new file to a shared lock. This keeps
190  // other processes from purging the new file and ensures that the reading
191  // process can use it.
192  cache->exclusive_to_shared_lock(d_fd);
193  BESDEBUG("gateway",
194  "RemoteHttpResource::retrieveResource() - Converted exclusive cache lock to shared lock." << endl);
195 
196  // Now update the total cache size info and purge if needed. The new file's
197  // name is passed into the purge method because this process cannot detect its
198  // own lock on the file.
199  unsigned long long size = cache->update_cache_info(d_resourceCacheFileName);
200  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - Updated cache info" << endl);
201 
202  if (cache->cache_too_big(size)) {
203  cache->update_and_purge(d_resourceCacheFileName);
204  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - Updated and purged cache." << endl);
205  }
206 
207  BESDEBUG("gateway", "RemoteHttpResource::retrieveResource() - END" << endl);
208 
209  d_initialized = true;
210 
211  return;
212  }
213  else {
214  if (cache->get_read_lock(d_resourceCacheFileName, d_fd)) {
215  BESDEBUG("gateway",
216  "RemoteHttpResource::retrieveResource() - Remote resource is in cache. cache_file_name: " << d_resourceCacheFileName << endl);
217  d_initialized = true;
218  return;
219  }
220  }
221 
222  string msg = "RemoteHttpResource::retrieveResource() - Failed to acquire cache read lock for remote resource: '";
223  msg += d_remoteResourceUrl + "\n";
224  throw libdap::Error(msg);
225 
226  }
227  catch (...) {
228  BESDEBUG("gateway",
229  "RemoteHttpResource::retrieveResource() - Caught exception, unlocking cache and re-throw." << endl);
230  cache->unlock_cache();
231  throw;
232  }
233 
234 }
235 
244 void RemoteHttpResource::writeResourceToFile(int fd)
245 {
246  BESDEBUG("gateway", "RemoteHttpResource::writeResourceToFile() - BEGIN" << endl);
247 
248  int status = -1;
249  try {
250  BESDEBUG("gateway",
251  "RemoteHttpResource::writeResourceToFile() - Saving resource " << d_remoteResourceUrl << " to cache file " << d_resourceCacheFileName << endl);
252  status = read_url(d_curl, d_remoteResourceUrl, fd, d_response_headers, d_request_headers,
253  d_error_buffer); // Throws Error.
254  if (status >= 400) {
255  BESDEBUG("gateway",
256  "RemoteHttpResource::writeResourceToFile() - HTTP returned an error status: " << status << endl);
257  // delete resp_hdrs; resp_hdrs = 0;
258  string msg = "Error while reading the URL: '";
259  msg += d_remoteResourceUrl;
260  msg += "'The HTTP request returned a status of " + libdap::long_to_string(status) + " which means '";
261  msg += http_status_to_string(status) + "' \n";
262  throw libdap::Error(msg);
263  }
264  BESDEBUG("gateway",
265  "RemoteHttpResource::writeResourceToFile() - Resource " << d_remoteResourceUrl << " saved to cache file " << d_resourceCacheFileName << endl);
266 
267  // rewind the file
268  // FIXME I think the idea here is that we have the file open and we should just keep
269  // reading from it. But the container mechanism works with file names, so we will
270  // likely have to open the file again. If that's true, lets remove this call. jhrg 3.2.18
271  int status = lseek(fd, 0, SEEK_SET);
272  if (-1 == status)
273  throw BESError("Could not seek within the response.", BES_NOT_FOUND_ERROR, __FILE__, __LINE__);
274 
275  BESDEBUG("gateway", "RemoteHttpResource::writeResourceToFile() - Reset file descriptor." << endl);
276 
277  // @TODO CACHE THE DATA TYPE OR THE HTTP HEADERS SO WHEN WE ARE RETRIEVING THE CACHED OBJECT WE CAN GET THE CORRECT TYPE
278  setType(d_response_headers);
279  }
280  catch (libdap::Error &e) {
281  throw;
282  }
283  BESDEBUG("gateway", "RemoteHttpResource::writeResourceToFile() - END" << endl);
284 }
285 
286 void RemoteHttpResource::setType(const vector<string> *resp_hdrs)
287 {
288 
289  BESDEBUG("gateway", "RemoteHttpResource::setType() - BEGIN" << endl);
290 
291  string type = "";
292 
293  // Try and figure out the file type first from the
294  // Content-Disposition in the http header response.
295  string disp;
296  string ctype;
297 
298  if (resp_hdrs) {
299  vector<string>::const_iterator i = resp_hdrs->begin();
300  vector<string>::const_iterator e = resp_hdrs->end();
301  for (; i != e; i++) {
302  string hdr_line = (*i);
303 
304  BESDEBUG("gateway", "RemoteHttpResource::setType() - Evaluating header: " << hdr_line << endl);
305 
306  hdr_line = BESUtil::lowercase(hdr_line);
307 
308  string colon_space = ": ";
309  int index = hdr_line.find(colon_space);
310  string hdr_name = hdr_line.substr(0, index);
311  string hdr_value = hdr_line.substr(index + colon_space.length());
312 
313  BESDEBUG("gateway",
314  "RemoteHttpResource::setType() - hdr_name: '" << hdr_name << "' hdr_value: '" <<hdr_value << "' "<< endl);
315 
316  if (hdr_name.find("content-disposition") != string::npos) {
317  // Content disposition exists
318  BESDEBUG("gateway", "RemoteHttpResource::setType() - Located content-disposition header." << endl);
319  disp = hdr_value;
320  }
321  if (hdr_name.find("content-type") != string::npos) {
322  BESDEBUG("gateway", "RemoteHttpResource::setType() - Located content-type header." << endl);
323  ctype = hdr_value;
324  }
325  }
326  }
327 
328  if (!disp.empty()) {
329  // Content disposition exists, grab the filename
330  // attribute
331  GatewayUtils::Get_type_from_disposition(disp, type);
332  BESDEBUG("gateway",
333  "RemoteHttpResource::setType() - Evaluated content-disposition '" << disp << "' matched type: \"" << type << "\"" << endl);
334  }
335 
336  // still haven't figured out the type. Check the content-type
337  // next, translate to the BES module name. It's also possible
338  // that even though Content-disposition was available, we could
339  // not determine the type of the file.
340  if (type.empty() && !ctype.empty()) {
341  GatewayUtils::Get_type_from_content_type(ctype, type);
342  BESDEBUG("gateway",
343  "RemoteHttpResource::setType() - Evaluated content-type '" << ctype << "' matched type \"" << type << "\"" << endl);
344  }
345 
346  // still haven't figured out the type. Now check the actual URL
347  // and see if we can't match the URL to a module name
348  if (type.empty()) {
349  GatewayUtils::Get_type_from_url(d_remoteResourceUrl, type);
350  BESDEBUG("gateway",
351  "RemoteHttpResource::setType() - Evaluated url '" << d_remoteResourceUrl << "' matched type: \"" << type << "\"" << endl);
352  }
353 
354  // still couldn't figure it out, punt
355  if (type.empty()) {
356  string err = (string) "RemoteHttpResource::setType() - Unable to determine the type of data"
357  + " returned from '" + d_remoteResourceUrl + "' Setting type to 'unknown'";
358  BESDEBUG("gateway", err);
359 
360  type = "unknown";
361  //throw BESSyntaxUserError( err, __FILE__, __LINE__ ) ;
362  }
363 
364  // @TODO CACHE THE DATA TYPE OR THE HTTP HEADERS SO WHEN WE ARE RETRIEVING THE CACHED OBJECT WE CAN GET THE CORRECT TYPE
365 
366  d_type = type;
367 
368  BESDEBUG("gateway", "RemoteHttpResource::setType() - END" << endl);
369 
370 }
BESFileLockingCache::get_read_lock
virtual bool get_read_lock(const std::string &target, int &fd)
Get a read-only lock on the file if it exists.
Definition: BESFileLockingCache.cc:544
BESFileLockingCache::create_and_lock
virtual bool create_and_lock(const std::string &target, int &fd)
Create a file in the cache and lock it for write access.
Definition: BESFileLockingCache.cc:599
gateway::GatewayCache
A cache for data files accessed using the Gateway.
Definition: GatewayCache.h:57
BESFileLockingCache::unlock_and_close
virtual void unlock_and_close(const std::string &target)
Definition: BESFileLockingCache.cc:713
BESInternalError
exception thrown if internal error encountered
Definition: BESInternalError.h:43
BESFileLockingCache::update_cache_info
virtual unsigned long long update_cache_info(const std::string &target)
Update the cache info file to include 'target'.
Definition: BESFileLockingCache.cc:737
BESFileLockingCache::get_cache_file_name
virtual std::string get_cache_file_name(const std::string &src, bool mangle=true)
Definition: BESFileLockingCache.cc:451
BESFileLockingCache::update_and_purge
virtual void update_and_purge(const std::string &new_file)
Purge files from the cache.
Definition: BESFileLockingCache.cc:940
BESFileLockingCache::unlock_cache
virtual void unlock_cache()
Definition: BESFileLockingCache.cc:686
BESUtil::lowercase
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:200
BESError
Abstract exception class for the BES with basic string message.
Definition: BESError.h:58
BESFileLockingCache::cache_too_big
virtual bool cache_too_big(unsigned long long current_size) const
Look at the cache size and determine whether it is too big.
Definition: BESFileLockingCache.cc:780
BESFileLockingCache::exclusive_to_shared_lock
virtual void exclusive_to_shared_lock(int fd)
Transfer from an exclusive lock to a shared lock.
Definition: BESFileLockingCache.cc:630