#!/usr/bin/env python3

# Copyright 2020 Charles Daniels

#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:

#  1. Redistributions of source code must retain the above copyright notice,
#  this list of conditions and the following disclaimer.

#  2. Redistributions in binary form must reproduce the above copyright notice,
#  this list of conditions and the following disclaimer in the documentation
#  and/or other materials provided with the distribution.

#  3. Neither the name of the copyright holder nor the names of its
#  contributors may be used to endorse or promote products derived from this
#  software without specific prior written permission.

#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.

version = "0.0.2"

descr = """Script to download the contents of a webpage, then print the results
of a specified xPath query and exit. Use cases primarily include extracting
bits of information readily located by xPath, such as counters or statuses.
One specific use case this tool has proven useful for was getting the number of
upvotes on a Reddit post. Disclaimer: this tool is intended for one off or very
low-volume queries, and includes tools to facilitate that such as user-agent
spoofing. Please be considerate of other people's servers and use a more
sophisticated tool and proper authentication (i.e. OAuth) if you plan to make
higher-volume queries against other peoples sites. If you use this tool to send
hundreds of requests per minute while spoofing your use agent to googlebot, you
will probably get IP banned from whatever you are doing, and are also very
likely  a bad person. In short, please be nice when using this tool."""
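
# Example usage (illustrative URL; any page with a <title> element works):
#
#   query-webpage --url https://example.com --query '//title' --extract_content
#
# prints the text of the page's <title> element. Similarly,
#
#   query-webpage --url https://example.com --query '//a' --extract_attribute href
#
# prints the value of the href attribute of each matched <a> element.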

import argparse
import urllib.request
import urllib.error
import lxml.etree
import sys


def show_warning(msg):
    if not args.quiet:
        sys.stderr.write("WARNING: {}\n".format(msg))

    sys.stderr.flush()

def show_error(msg):
    if not args.quiet:
        sys.stderr.write("ERROR: {}\n".format(msg))

    sys.stderr.flush()
    sys.stdout.flush()
    sys.exit(1)

def process_element(elem, extractmode, outputsep, extractarg=""):
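    """Write one result element to stdout, followed by outputsep.

    extractmode selects between printing the element's text content
    ("extract_content"), the value of the attribute named by extractarg
    ("extract_attribute"), or the element's serialized markup ("raw").
    """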
    if extractmode == "extract_content":
        if elem.text is None:
            # Sometimes we find an element with no visible text, in which case
            # we can't really do much of anything.
            show_warning("Ignored element '{}' because it had empty .text."
                         .format(elem))
        else:
            sys.stdout.write(elem.text)
            sys.stdout.write(outputsep)

    elif extractmode == "extract_attribute":
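        # Strip any '@' from the requested attribute name, so that both
        # "@href" and "href" select the href attribute.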
        for item in (i for i in elem.items() if i[0] == extractarg.replace('@', '')):
            sys.stdout.write(item[1])
            sys.stdout.write(outputsep)

    elif extractmode == "raw":
        text = lxml.etree.tostring(elem, pretty_print=True)
        try:
            text = text.decode(args.encoding)
        except Exception as e:
            show_error("Failed to decode text. Exception was: '{}'.".format(e))

        if args.noexpand:
            text = "%r" % text
            # this removes the leading and trailing quote characters
            # introduced by the %r in the above statement.
            text = text[1:-1]
        sys.stdout.write(text)
        sys.stdout.write(outputsep)



common_user_agents = {
        "edge-win10": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
        "chromebook": "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
        "safari-macos": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
        "chrome-win7": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
        "firefox-linux": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
        "googlebot": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
}

common_user_agent_msg = ', '.join(common_user_agents)

parser = argparse.ArgumentParser(description=descr)

parser.add_argument("--url", "-u", help="Specify target URL.", required=True)

parser.add_argument("--query", "-q", help="Specify xPath query.",
                    required=True)


parser.add_argument("--output_separator", "-s", default="\n",
                    help="Change string used to delimit matches to query in " +
                    "the output. (default: single newline)")

parser.add_argument("--disable_separator_expansion", "-E", default=False,
                    action="store_true", help="If asserted, escape " +
                    "sequences such as \\n found within the string supplied" +
                    " in --output_separator will not be expanded.")

parser.add_argument("--encoding", "-n", default="utf-8",
                    help="Specify string encoding to use where applicable." +
                    " (default: utf-8)")

parser.add_argument("--noexpand", "-x" , default=False, action="store_true",
                    help="Assert to avoid expanding control characters " +
                    "such as \\n wherever possible.")

parser.add_argument("--quiet", "-t", default=False, action="store_true",
                    help="Suppress all warning and error messages.")

parser.add_argument("--user_agent", "-a", default="Python-urllib/3.5",
                    help="Spoof an alternate user agent string." +
                    " (default: Python-urllib/3.5)")

parser.add_argument("--common_user_agent", "-A", default=None,
                    help="Select a user agent from a list of common ones. " +
                    "Valid options are: "  + common_user_agent_msg +
                    ". This overrides any value given via --user_agent")

extractgroup = parser.add_mutually_exclusive_group()

extractgroup.add_argument("--extract_attribute", "-X", default=None,
                    help="Specify an HTML attribute which should be " +
                    "extracted. Mutually exclusive with --extract_content")

extractgroup.add_argument("--extract_content", "-e", default=False,
                    action="store_true", help="Display content only as " +
                    "output, rather than HTML markup")

extractgroup.add_argument("--multiquery", "-m", default=None, nargs='+',
                    help="Each argument specifies an xpath query that will" +
                    "run relative to each result from --query. If the first " +
                    "character of a multiquery is e, then it is discarded " +
                    "and each multiquery works as if --extract_content was " +
                    "Asserted. If the first character of a multiquery is " +
                    "X, then it is discarded and all character from it to " +
                    "the next instance of ; are also discarded from the " +
                    "query, and are treated as if they were an argument " +
                    "to --extract_attribute, not including the ;. In " +
                    "multiquery mode, matches to --query are not directly " +
                    "output, and --extract_attribute and --extract_content " +
                    "cannot be specified.")

parser.add_argument("--mqsep", "-S", default='\n', help="Specify the " +
                    "separator used to delineate different multiquery " +
                    "results from one another. (default: newline)")

parser.add_argument('--version', action='version', version=version)

args = parser.parse_args()

if args.common_user_agent is not None:
    if args.common_user_agent not in common_user_agents:
        show_error("Common user agent '{}' is not one of '{}'."
                   .format(args.common_user_agent, common_user_agent_msg))
    else:
        args.user_agent = common_user_agents[args.common_user_agent]

# This allows the user to specify escape codes like \n within
# --output_separator
if not args.disable_separator_expansion:
    args.output_separator = bytes(args.output_separator, "utf-8")
    args.output_separator = args.output_separator.decode("unicode_escape")
    args.mqsep = bytes(args.mqsep, "utf-8")
    args.mqsep = args.mqsep.decode("unicode_escape")
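
    # For example, passing -s '\n\n' on the command line supplies the four
    # literal characters backslash-n-backslash-n; the unicode_escape round
    # trip above turns that into two real newline characters.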


try:
    request = urllib.request.Request(args.url, data=None, headers={
        "User-Agent": args.user_agent})
    handle = urllib.request.urlopen(request)
except urllib.error.URLError as e:
    show_error("Failed to open URL. Exception was: '{}'.".format(e))

try:
    tree = lxml.etree.HTML(handle.read())
except Exception as e:
    show_error("Failed to parse webpage. Exception was: '{}'.".format(e))

handle.close()

try:
    results = tree.xpath(args.query)
except Exception as e:
    show_error("Query failed. Exception was: '{}'.".format(e))


for r in results:

    if args.multiquery is not None:
        for mq in args.multiquery:
            extractmode = "raw"
            extractarg = ""
            query = mq
            if mq[0] == 'e':
                extractmode = "extract_content"
                query = mq[1:]
            elif mq[0] == 'X':
                extractmode = "extract_attribute"
                query = ';'.join(mq[1:].split(';')[1:])
                extractarg = mq[1:].split(';')[0]
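                # e.g. mq = "Xhref;.//a" yields extractarg = "href" and
                # query = ".//a"; the join preserves any later ';'
                # characters within the query itself.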

            mqresults = []
            try:
                mqresults = r.xpath(query)
            except Exception as e:
                show_error("Query failed. Exception was: '{}'.".format(e))

            for mqr in mqresults:
                process_element(mqr, extractmode, args.mqsep, extractarg=extractarg)

        sys.stdout.write(args.output_separator)

    elif args.extract_content:
        process_element(r, "extract_content", args.output_separator)

    elif args.extract_attribute is not None:
        process_element(r, "extract_attribute", args.output_separator, extractarg=args.extract_attribute)

    else:
        process_element(r, "raw", args.output_separator)


sys.stdout.flush()