Tim Ruehsen <[email protected]> writes:
> could you create local commits (maybe you already have) and attach the output 
> of 'git format-patch -1' (-1 = last one commit, -2 = last two commits, ...) ?

I've cleaned up the documentation changes and provided a proper commit
message.

Dale
From 14fe0982e02ee4c10b241f9e7a29fb3e5164c6d5 Mon Sep 17 00:00:00 2001
From: "Dale R. Worley" <[email protected]>
Date: Sun, 16 Oct 2016 14:44:15 -0400
Subject: [PATCH] Amend redirection behavior

* doc/wget.texi: Update documentation.  Fix errors and omissions.
* src/convert.h (struct urlpos): Add link_redirect_p flag to struct urlpos to
  indicate the URL resulted from a redirection.
* src/recur.c (download_child): Suppress --no-parent check for redirection
  URLs.
* src/recur.c (download_child): Suppress directory checks for redirection
  URLs and page requisites (if -p).
* src/recur.c (descend_redirect): Set link_redirect_p flag on struct urlpos
  for redirection URLs.  Remove old test for suppressing directory checks for
  redirection URLs.
---
 doc/wget.texi | 41 +++++++++++++++++++++++++++++++++++++----
 src/convert.h |  1 +
 src/recur.c   | 53 ++++++++++++++++++++++++++++-------------------------
 3 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/doc/wget.texi b/doc/wget.texi
index f42773e..91219e5 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -2357,6 +2357,11 @@ your shell from expanding it, like in @samp{-A "*.mp3"} or @samp{-A '*.mp3'}.
 @itemx --reject-regex @var{urlregex}
 Specify a regular expression to accept or reject the complete URL.
 
+@strong{Note} that the effect of @samp{--accept-regex} and
+@samp{--reject-regex} is suppressed for
+fetching redirection URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @item --regex-type @var{regextype}
 Specify the regular expression type.  Possible types are @samp{posix} or
 @samp{pcre}.  Note that to be able to use @samp{pcre} type, wget has to be
@@ -2431,18 +2436,32 @@ Specify a comma-separated list of directories you wish to follow when
 downloading (@pxref{Directory-Based Limits}).  Elements
 of @var{list} may contain wildcards.
 
+@strong{Note} that the effect of @samp{--include-directories} and
+@samp{--exclude-directories} is suppressed for
+fetching redirection URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @item -X @var{list}
 @itemx --exclude-directories=@var{list}
 Specify a comma-separated list of directories you wish to exclude from
 download (@pxref{Directory-Based Limits}).  Elements of
 @var{list} may contain wildcards.
 
+@strong{Note} that the effect of @samp{--include-directories} and
+@samp{--exclude-directories} is suppressed for
+fetching redirection URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @item -np
 @item --no-parent
-Do not ever ascend to the parent directory when retrieving recursively.
+Do not ascend to the parent directory when retrieving recursively.
 This is a useful option, since it guarantees that only the files
 @emph{below} a certain hierarchy will be downloaded.
 @xref{Directory-Based Limits}, for more details.
+
+@strong{Note} that the effect of @samp{--no-parent} is suppressed for
+fetching redirection URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
 @end table
 
 @c man end
@@ -2689,6 +2708,11 @@ comma-separated list, and given as an argument to @samp{-A}.
 The argument to @samp{--accept-regex} option is a regular expression which
 is matched against the complete URL.
 
+@strong{Note} that the effect of @samp{--accept-regex} and
+@samp{--reject-regex} is suppressed for
+fetching redirection URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @cindex reject wildcards
 @cindex reject suffixes
 @cindex wildcards, reject
@@ -2709,9 +2733,14 @@ Analogously, to download all files except the ones beginning with
 expansion by the shell.
 @end table
 
-The argument to @samp{--accept-regex} option is a regular expression which
+The argument to @samp{--reject-regex} option is a regular expression which
 is matched against the complete URL.
 
+@strong{Note} that the effect of @samp{--accept-regex} and
+@samp{--reject-regex} is suppressed for
+fetching redirection URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @noindent
 The @samp{-A} and @samp{-R} options may be combined to achieve even
 better fine-tuning of which files to retrieve.  E.g. @samp{wget -A
@@ -2778,12 +2807,16 @@ Wget offers three different options to deal with this requirement.  Each
 option description lists a short name, a long name, and the equivalent
 command in @file{.wgetrc}.
 
+@strong{Note} that the effect of all of these options is suppressed
+for fetching redirection URLs and for fetching page requisite URLs if
+@samp{--page-requisites} is specified.
+
 @cindex directories, include
 @cindex include directories
 @cindex accept directories
 @table @samp
 @item -I @var{list}
-@itemx --include @var{list}
+@itemx --include-directories @var{list}
 @itemx include_directories = @var{list}
 @samp{-I} option accepts a comma-separated list of directories included
 in the retrieval.  Any other directories will simply be ignored.  The
@@ -2801,7 +2834,7 @@ wget -I /people,/cgi-bin http://host/people/bozo/
 @cindex exclude directories
 @cindex reject directories
 @item -X @var{list}
-@itemx --exclude @var{list}
+@itemx --exclude-directories @var{list}
 @itemx exclude_directories = @var{list}
 @samp{-X} option is exactly the reverse of @samp{-I}---this is a list of
 directories @emph{excluded} from the download.  E.g. if you do not want
diff --git a/src/convert.h b/src/convert.h
index e3ff6f0..af0ab79 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -72,6 +72,7 @@ struct urlpos {
   unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
   unsigned int link_expect_html :1; /* expected to contain HTML */
   unsigned int link_expect_css  :1; /* expected to contain CSS */
+  unsigned int link_redirect_p  :1; /* the url comes from a redirection */
 
   unsigned int link_refresh_p   :1; /* link was received from
                                        <meta http-equiv=refresh content=...> */
diff --git a/src/recur.c b/src/recur.c
index 1469e31..36aee22 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -650,14 +650,15 @@ download_child (const struct urlpos *upos, struct url *parent, int depth,
   /* 4. Check for parent directory.
 
      If we descended to a different host or changed the scheme, ignore
-     opt.no_parent.  Also ignore it for documents needed to display
-     the parent page when in -p mode.  */
+     opt.no_parent.  Also ignore it for redirections and documents
+     needed to display the parent page when in -p mode.  */
   if (opt.no_parent
       && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
       && 0 == strcasecmp (u->host, start_url_parsed->host)
       && (u->scheme != start_url_parsed->scheme
           || u->port == start_url_parsed->port)
-      && !(opt.page_requisites && upos->link_inline_p))
+      && !(opt.page_requisites && upos->link_inline_p)
+      && !upos->link_redirect_p)
     {
       if (!subdir_p (start_url_parsed->dir, u->dir))
         {
@@ -668,23 +669,30 @@ download_child (const struct urlpos *upos, struct url *parent, int depth,
         }
     }
 
-  /* 5. If the file does not match the acceptance list, or is on the
-     rejection list, chuck it out.  The same goes for the directory
-     exclusion and inclusion lists.  */
-  if (opt.includes || opt.excludes)
-    {
-      if (!accdir (u->dir))
-        {
-          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
-          reason = WG_RR_LIST;
-          goto out;
-        }
-    }
-  if (!accept_url (url))
+  /* 5. If the file does not match the acceptance regexp list, or is on the
+     rejection regexp list, chuck it out.  The same goes for the directory
+     exclusion and inclusion lists.
+
+     Ignore this test for redirections and documents needed to display
+     the parent page when in -p mode.  */
+  if (!(opt.page_requisites && upos->link_inline_p)
+      && !upos->link_redirect_p)
     {
-      DEBUGP (("%s is excluded/not-included through regex.\n", url));
-      reason = WG_RR_REGEX;
-      goto out;
+      if (opt.includes || opt.excludes)
+	{
+	  if (!accdir (u->dir))
+	    {
+	      DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
+	      reason = WG_RR_LIST;
+	      goto out;
+	    }
+	}
+      if (!accept_url (url))
+	{
+	  DEBUGP (("%s is excluded/not-included through regex.\n", url));
+	  reason = WG_RR_REGEX;
+	  goto out;
+	}
     }
 
   /* 6. Check for acceptance/rejection rules.  We ignore these rules
@@ -800,18 +808,13 @@ descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
 
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
+  upos->link_redirect_p = 1;
 
   reason = download_child (upos, orig_parsed, depth,
                               start_url_parsed, blacklist, iri);
 
   if (reason == WG_RR_SUCCESS)
     blacklist_add (blacklist, upos->url->url);
-  else if (reason == WG_RR_LIST || reason == WG_RR_REGEX)
-    {
-      DEBUGP (("Ignoring decision for redirects, decided to load it.\n"));
-      blacklist_add (blacklist, upos->url->url);
-      reason = WG_RR_SUCCESS;
-    }
   else
     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 
-- 
1.8.3.1

Reply via email to