Very Simple Googling

Post by **The Beatles** » Mon Apr 11, 2005 11:08 pm

It's what I needed, and I hope you can use it. The script is trivial, and it starts from Google's dumbed-down IE sidebar interface anyway. However, it's 100% accurate.

search:

code: Select all

#!/bin/bash
# search - print the result URLs of a Google query, one per line.
#
# usage: search QUERY [PAGES]
#   QUERY  search term; it is URL-encoded with the urlencode helper
#   PAGES  number of result pages to fetch, 10 hits each (default: 1)
#
# The urlencode and urldecode helpers are looked up in the directory
# this script lives in, so the script works from any working directory
# and without that directory being in PATH.
if [ $# -lt 1 ]; then
        echo "usage: ${0##*/} QUERY [PAGES]" >&2
        exit 1
fi

here=$(dirname "$0")

# Quote the query so that blanks and glob characters reach urlencode
# unchanged.
query=$(printf '%s\n' "$1" | "$here/urlencode")

# Without a second argument fetch one page; an empty value used to
# turn into "-1" and produced no output at all.
pages=${2:-1}

for ((i = 0; i < pages; i++)); do
        offset=$((i * 10))      # Google pages results in steps of 10
        lynx -source "http://www.google.com/ie?q=$query&start=$offset" |
                grep NOBR |
                sed -e "s/.*HREF=//" -e "s/>.*//" |
                "$here/urldecode"
done

search.enc:

code: Select all

#!/bin/bash
# search.enc - print the result URLs of a Google query, one per line.
#
# usage: search.enc ENCODED_QUERY [PAGES]
#   ENCODED_QUERY  search term, already URL-encoded; it is placed into
#                  the request URL as given
#   PAGES          number of result pages to fetch, 10 hits each
#                  (default: 1)
#
# The urldecode helper is looked up in the directory this script lives
# in, so that directory does not have to be in PATH.
if [ $# -lt 1 ]; then
        echo "usage: ${0##*/} ENCODED_QUERY [PAGES]" >&2
        exit 1
fi

here=$(dirname "$0")
query=$1

# Without a second argument fetch one page; an empty value used to
# turn into "-1" and produced no output at all.
pages=${2:-1}

for ((i = 0; i < pages; i++)); do
        offset=$((i * 10))      # Google pages results in steps of 10
        lynx -source "http://www.google.com/ie?q=$query&start=$offset" |
                grep NOBR |
                sed -e "s/.*HREF=//" -e "s/>.*//" |
                "$here/urldecode"
done

urlencode:

code: Select all

:
##########################################################################
# Shellscript:  urlencode - encode URL data
# Version    :  1.2
# Author     :  Heiner Steven (heiner.steven@odn.de)
# Date       :  2000-03-15
# Categories :  File Conversion, WWW, CGI
# SCCS-Id.   :  @(#) urlencode  1.2 04/03/03
##########################################################################
# Description
#       Encode data according to
#           RFC 1738: "Uniform Resource Locators (URL)" and
#           RFC 1866: "Hypertext Markup Language - 2.0" (HTML)
#
#       This encoding is used e.g. for the MIME type
#       "application/x-www-form-urlencoded"
#
# Notes
#    o  The default behaviour is not to encode the line endings. This
#       may not be what was intended, because the result will be
#       multiple lines of output (which cannot be used in an URL or a
#       HTTP "POST" request). If the desired output should be one
#       line, use the "-l" option.
#
#    o  The "-l" option assumes that the end-of-line is denoted by
#       the character LF (ASCII 10). This is not true for Windows,
#       where the end of a line is denoted by the two characters
#       CR LF (ASCII 13 10), or for classic Mac OS, which uses a
#       single CR (ASCII 13).
#       We use this for symmetry; data processed in the following way:
#               cat | urlencode -l | urldecode -l
#       should (and will) result in the original data
#
#    o  Large lines (or binary files) will break many AWK
#       implementations. If you get the message
#               awk: record `...' too long
#                record number xxx
#       consider using GNU AWK (gawk).
#
#    o  urlencode will always terminate its output with an EOL
#       character
#
# See also
#       urldecode
##########################################################################

# Program name as invoked, with the directory part stripped; used as a
# prefix in the usage text and in messages.
PN=`basename "$0"`
VER='1.2'                               # Version shown in the usage text

# The awk implementation can be chosen through the environment, e.g.
# AWK=gawk.  ":=" (instead of "=") also falls back to "awk" when AWK is
# set but empty; an empty value would leave the awk program text itself
# as the command to run.  The quotes keep the value from being globbed.
: "${AWK:=awk}"

# Usage
#   Print the program name, version and a summary of the command line
#   syntax to standard error, then terminate with exit status 1.
Usage () {
    printf '%s\n' \
        "$PN - encode URL data, $VER" \
        "usage: $PN [-l] [file ...]" \
        "    -l:  encode line endings (result will be one line of output)" \
        "" \
        "The default is to encode each input line on its own." >&2
    exit 1
}

# Msg LINE...
#   Write each argument to standard error as a line of its own,
#   prefixed with the program name and a colon.
Msg () {
    for MsgLine in "$@"
    do
        echo "$PN: $MsgLine"
    done >&2
}

# Fatal LINE...
#   Report the arguments through Msg, then terminate with exit status 1.
Fatal () {
    Msg "$@"
    exit 1
}

# Parse the command line with the "getopts" builtin.
#
# The external getopt(1) used before had its output re-split by the
# shell, so a file name containing blanks arrived as several arguments,
# and because the exit status tested was that of "set", an invalid
# option did not reliably end up in Usage.
#
# The leading ":" in the option string silences the builtin's own error
# message; an invalid option is reported as "?" and shows the usage
# text, like -h.
EncodeEOL=no                            # "yes": encode line endings, too
while getopts :hl Option
do
    case "$Option" in
        l)      EncodeEOL=yes;;
        *)      Usage;;                 # -h or an invalid option
    esac
done
shift $((OPTIND - 1))                   # Remaining arguments: file names

# Encode the input (the files named on the command line, or standard
# input if there are none) with awk, one input line at a time.
#
# LC_ALL=C makes awk work on single bytes.  In a multibyte locale such
# as UTF-8 some awk implementations hand out whole characters instead;
# those are either missing from the ord[] table below (and were then
# encoded as "%00") or are mapped to a code point rather than to the
# bytes that have to be escaped.
#
# The value of $EncodeEOL is pasted into the program text; it is only
# ever "yes" or "no".
LC_ALL=C $AWK '
    BEGIN {
        # We assume an awk implementation that is just plain dumb.
        # We will convert a character to its ASCII value with the
        # table ord[], and produce two-digit hexadecimal output
        # without the printf("%02X") feature.

        EOL = "%0A"             # "end of line" string (encoded)
        split ("1 2 3 4 5 6 7 8 9 A B C D E F", hextab, " ")
        hextab [0] = 0
        for ( i=1; i<=255; ++i ) ord [ sprintf ("%c", i) "" ] = i + 0
        if ("'"$EncodeEOL"'" == "yes") EncodeEOL = 1; else EncodeEOL = 0
    }
    {
        encoded = ""
        for ( i=1; i<=length ($0); ++i ) {
            c = substr ($0, i, 1)
            if ( c ~ /[a-zA-Z0-9.-]/ ) {
                encoded = encoded c             # safe character
            } else if ( c == " " ) {
                encoded = encoded "+"   # special handling
            } else {
                # unsafe character, encode it as a two-digit hex-number
                lo = ord [c] % 16
                hi = int (ord [c] / 16);
                encoded = encoded "%" hextab [hi] hextab [lo]
            }
        }
        if ( EncodeEOL ) {
            # Line ending becomes "%0A"; all output stays on one line
            printf ("%s", encoded EOL)
        } else {
            print encoded
        }
    }
    END {
        #if ( EncodeEOL ) print ""
    }
' "$@"

urldecode:

code: Select all

:
##########################################################################
# Title      :  urldecode - decode URL data
# Author     :  Heiner Steven (heiner.steven@odn.de)
# Date       :  2000-03-15
# Categories :  File Conversion, WWW, CGI
# SCCS-Id.   :  @(#) urldecode  1.4 04/03/03
##########################################################################
# Description
#       Decode data according to
#           RFC 1738: "Uniform Resource Locators (URL)" and
#           RFC 1866: "Hypertext Markup Language - 2.0" (HTML)
#           RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax"
#
#       This encoding is used e.g. for the MIME type
#       "application/x-www-form-urlencoded"
#
# Notes
#    o  The default behaviour is to decode each line independently of the
#       others, and print each result on a line of its own. If the line
#       endings are encoded, too (i.e. "%0A" or "%0D%0A"), use the
#       "-l" option to prevent urldecode from adding additional line
#       endings.
#
#    o  Large lines (or binary files) will break many AWK
#       implementations. If you get the message like
#               awk: record `%3A%0A%23%23%23%23%2...' too long
#       consider using GNU AWK (gawk).
#       If the input line was generated using "urlencode -l", try to
#       omit the "-l" option, if applicable.
#
# See also
#       urlencode
##########################################################################

# Name this program was invoked with (directory part stripped) and its
# version; both appear in the usage text.
PN=$(basename "$0")
VER=1.4

# awk implementation to run; an unset or empty AWK selects plain "awk".
[ -n "$AWK" ] || AWK=awk

# Usage
#   Print the program name, version and a summary of the command line
#   syntax to standard error, then terminate with exit status 1.
Usage () {
    printf '%s\n' \
        "$PN - decode URL data, $VER" \
        "usage: $PN [-l] [file ...]" \
        "    -l:  single-line input (line endings are encoded)" >&2
    exit 1
}

# Msg LINE...
#   Write each argument to standard error as a line of its own,
#   prefixed with the program name and a colon.
Msg () {
    while [ $# -gt 0 ]
    do
        MsgLine=$1
        echo "$PN: $MsgLine" >&2
        shift
    done
}

# Fatal LINE...
#   Report the arguments through Msg, then terminate with exit status 1.
Fatal () {
    Msg "$@"
    exit 1
}

# Parse the command line with the "getopts" builtin.
#
# The external getopt(1) used before had its output re-split by the
# shell, so a file name containing blanks arrived as several arguments,
# and because the exit status tested was that of "set", an invalid
# option did not reliably end up in Usage.
#
# The leading ":" in the option string silences the builtin's own error
# message; an invalid option is reported as "?" and shows the usage
# text, like -h.
EncodedLF=no                            # "yes": input line endings are encoded
while getopts :hl Option
do
    case "$Option" in
        l)      EncodedLF=yes;;
        *)      Usage;;                 # -h or an invalid option
    esac
done
shift $((OPTIND - 1))                   # Remaining arguments: file names

# Decode the input (the files named on the command line, or standard
# input if there are none) with awk, one input line at a time.
#
# LC_ALL=C makes awk work on single bytes.  In a multibyte locale such
# as UTF-8 some awk implementations let sprintf("%c") produce the
# multi-byte form of a code point instead of the single byte a "%XX"
# sequence stands for, which corrupts encoded non-ASCII text.
#
# The value of $EncodedLF is pasted into the program text; it is only
# ever "yes" or "no".
LC_ALL=C $AWK '
    BEGIN {
        # Value of each hexadecimal digit, upper and lower case
        hextab ["0"] = 0;       hextab ["8"] = 8;
        hextab ["1"] = 1;       hextab ["9"] = 9;
        hextab ["2"] = 2;       hextab ["A"] = hextab ["a"] = 10
        hextab ["3"] = 3;       hextab ["B"] = hextab ["b"] = 11;
        hextab ["4"] = 4;       hextab ["C"] = hextab ["c"] = 12;
        hextab ["5"] = 5;       hextab ["D"] = hextab ["d"] = 13;
        hextab ["6"] = 6;       hextab ["E"] = hextab ["e"] = 14;
        hextab ["7"] = 7;       hextab ["F"] = hextab ["f"] = 15;
        if ("'"$EncodedLF"'" == "yes") EncodedLF = 1; else EncodedLF = 0
    }
    {
        decoded = ""
        i   = 1
        len = length ($0)
        while ( i <= len ) {
            c = substr ($0, i, 1)
            if ( c == "%" ) {
                if ( i+2 <= len ) {
                    c1 = substr ($0, i+1, 1)
                    c2 = substr ($0, i+2, 1)
                    if ( hextab [c1] == "" || hextab [c2] == "" ) {
                        # Not a hex pair: warn and keep the "%" as it is
                        print "WARNING: invalid hex encoding: %" c1 c2 | \
                                "cat >&2"
                    } else {
                        code = 0 + hextab [c1] * 16 + hextab [c2] + 0
                        #print "\ncode=", code
                        c = sprintf ("%c", code)
                        i = i + 2
                    }
                } else {
                    # Fewer than two characters follow the "%".  Report
                    # the complete rest of the line on standard error,
                    # so the warning does not end up in the decoded data.
                    print "WARNING: invalid % encoding: " substr ($0, i) | \
                            "cat >&2"
                }
            } else if ( c == "+" ) {    # special handling: "+" means " "
                c = " "
            }
            decoded = decoded c
            ++i
        }
        if ( EncodedLF ) {
            printf "%s", decoded        # no line newline on output
        } else {
            print decoded
        }
    }
' "$@"

urlencode and urldecode are not mine, obviously. search accepts two arguments: the first is the search term, the second is the number of result pages (10 hits each), which is '1' by default. search.enc is the same, but doesn't urlencode your query. I needed this for the camera searches (windy, Aus).
Stick them all in one directory to use, and chmod a+x them.

Frost And Flame Forums

Very Simple Googling

Members connected in real time