/*
   UniPage v0.4 2008/02/06 by Alfredo Fernndez Daz <mrwarper@yahoo.es>

   Merges multi-component HTML documents in one single file

   License: Free. Take everything you need and don't even thank me.
   Well, actually it would be nice if you thanked me ;-)
*/

'@Echo Off'
if RxFuncQuery('SysLoadFuncs') then do
  call RxFuncAdd 'SysLoadFuncs', 'RexxUtil', 'SysLoadFuncs'
  call SysLoadFuncs
 end

parse source HostOS CallType ThisFile
if (HostOS = 'OS/2' | HostOS = 'WIN32') then
  dir_sep = '\'
else
  dir_sep = '/'

/* Global settings */

VerStr     = 'UniPage v0.4'
UserAgent  = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)'
filelist.0 = 0
bad_uris.0 = 0
lastcount  = 0

/* Select run mode */

http_host  = value('HTTP_HOST',,'ENVIRONMENT')
td         = directory()

/* CGI mode */

if http_host <> '' then do

  output = 'cgi'
  cgi_output = ''

  call msg_out verstr||', working on '||date('s')||' at: '||http_host||' '||td,0
  call msg_out '-----------------------------------------------------------',0

  query_string = value('QUERY_STRING',,'ENVIRONMENT')
  parse var query_string command '=' main_doc
  if command <> 'url' then do
    call msg_out 'Syntax error: it is ...unipage.cmd?url=scheme%3A%2F%2F...',1
    call msg_out 'Example: ...unipage.cmd?url=http%3A%2F%2Fwww.11ipc.org%2Findex.htm',1
    call bomb_out 0
   end
  do while pos('%',main_doc)>0
    n=pos('%',main_doc)
    main_doc=substr(main_doc,1,n-1)||x2c(substr(main_doc,n+1,2))||substr(main_doc,n+3)
   end

  call main
  exit 0

 end

/* Local mode (cmd line) - else/do are not needed 'cause we bombed_out before */

output = 'local'
parse arg parameters
parameters = strip(parameters)

if parameters = '' then do
  call msg_out verstr||', working on '||date('s')||' from: '||td,1
  call msg_out '-----------------------------------------------------------',0
  call msg_out '',1
  call msg_out 'Usage:    "UniPage [<filespec> [<base URI>] [/S]]"',1
  call msg_out '    or    "UniPage [<URI(*)>]"',1
  call msg_out '',1
  call msg_out '(*) Any "//"s have to be changed to "\\" in every URI for',1
  call msg_out '  command-line usage',1
  call msg_out '',1
  call msg_out 'Examples: "UniPage *.htm"',1
  call msg_out '          "UniPage *.htm /S"',1
  call msg_out '          "UniPage index.htm http:\\www.11ipc.org/"',1
  call msg_out '          "UniPage *.htm http:\\www.11ipc.org/ /S"',1
  call msg_out '          "Unipage http:\\www.11ipc.org/index.htm"',1
  call msg_out '',1
  call msg_out 'No parameters specified. Please enter one set of parameters per line.',0
  call msg_out 'Enter a blank line to end entering parameters and begin to process: ',0
  i=0
  do until new = ''
    parse pull new
    if new <> '' then do
      i=i+1
      parameters.i = new
     end
   end
  parameters.0 = i
  if parameters.0 = 0 then
    exit 1
  else do
    call msg_out '',0
    call msg_out 'Read URI/filespec list: '||parameters.0||' line(s).',0
   end  
 end
else do
  parameters.0 = 1
  parameters.1 = parameters
 end

call time 'E'

call msg_out '',0
call msg_out verstr||', working on '||date('s')||' from: '||td,0
call msg_out '-----------------------------------------------------------',0

do i=1 to parameters.0
  parameters = strip(parameters.i)
  if left(parameters,1)='"' then /* necessary when & is present in URIs */
    parse var parameters '"' spec '"' rest
  else 
    parse var parameters spec rest
  rest = strip(rest)
  rest = subst(rest,'\','/',0)  /* Just in case someone typed \s */
  if left(rest,1)='"' then      /* necessary when & is present in URIs */
    parse var rest '"' two '"' three
  else 
    parse var rest two three
  three = strip(three)
  three = strip(three,,'"')

  if (translate(two) = '/S') | (translate(three) = '/S') then
    flags = 'FOS'
  else
    flags = 'FO'

  /* Base URI for local documents - will be used for missing files */
  select
    when pos('://',two)>0 then
      root_uri = two
    when pos('://',three)>0 then
      root_uri = three
    otherwise
      root_uri = ''
   end

  /* Download from URI or get local file(s) and start unifying... */

  /* \ is not allowed in URIs -> so if it's present in the args,
     it is to avoid the REXX '//' inconvenience
  */
  if pos(':\\',spec)>0 then
    spec = subst(spec,'\','/')

  if pos('://',spec)>0 then do  
    parse var spec scheme '://' host '/' path
    path = '/'||path  /* So we avoid the damned costume to skip the trailing '/' */
    main_doc = scheme||'://'||host||path
    call msg_out 'Processing file '||main_doc,0
    call msg_out '-----------------------------------------------------------',0
    call main
    call msg_out 'Done, '||format(time('R'),,2)||'s. ',0
   end
  else do
    /* Really local mode - even full filenames are considered as filters */
    call SysFileTree spec,victims,flags
    do l=1 to victims.0
      main_doc = victims.l
      parse var victims.l (td) (dir_sep) victims.l
      call msg_out 'Processing file '||right(l,length(victims.0),' ')||'/'||victims.0||': '||victims.l,0
      call msg_out '-----------------------------------------------------------',0
      call main
      call msg_out 'Done, '||format(time('R'),,2)||'s. ',0
      call msg_out '',0
     end
   end

 end

exit 0

main:
  /* index already in use: i */
  drop root_htm
  gotit = get_new_file(main_doc,'')
  if gotit <> 0 then do
    filelist.gotit.ftyp     = 'htm'
    filelist.gotit.base_uri = root_uri
    root_htm = gotit
    /*
       it's very hard to implement recursivity w/o real
       local variables so this is a cheap workaround
    */
    do until lastcount = filelist.0
      lwlim = lastcount+1
      lastcount = filelist.0
      do j=lwlim to filelist.0
        select
          when filelist.j.ftyp = 'htm' then
            call htm_prepare j
          when filelist.j.ftyp = 'css' then
            call css_prepare j,1
          otherwise nop
         end
       end
     end
    call msg_out '-Done parsing files. Time elapsed: '||format(time('E'),,2),0
    do j=filelist.0 to 1 by -1
      select
        when filelist.j.ftyp = 'htm' then
          call htm_subst j
        when filelist.j.ftyp = 'css' then
          call css_prepare j,2
        otherwise nop
       end
     end
    call output main_doc,filelist.root_htm.guts
   end
return

htm_prepare:
  parse arg n
  /* index already in use: i,j */
  if filelist.n.stage > 0 then
    return
  call msg_out 'Parsing '||filelist.n.uri,0

  /* Check if we're being base-redirected... */
  redir = ''
  call htmltaghunt filelist.n.guts,base_redir,'base.href'
  select
    when base_redir.0 = 1 then
      redir = base_redir.1
    when base_redir.0 > 1 then do
      call msg_out 'Warning: multiple <base > redirectors found... '
      redir = base_redir.1
     end
    otherwise nop /* =0 as usual... */
   end
  if redir <> '' then do
    parse var redir offset length .
    base_redir = substr(htmldata,offset,length)
    call htmlattrlist base_redir,'base_attrs'
    do k=1 to base_attrs.0 
      parse var base_attrs.k attrname attrvalue
      if attrname = 'href' then do
        if filelist.n.base_uri <> '' then
          call msg_out 'Warning: base URI conflict - was: '||filelist.n.base_uri,1
        call msg_out 'Base URI changed to: '||attrvalue,0
        filelist.n.base_uri = attrvalue
        leave
       end
     end
   end

  filelist.n.link.0 = 0
  call htmltaghunt filelist.n.guts,linktag,'style link.href script.src img.src input.src frame.src iframe.src'

  do k=1 to linktag.0
    parse var linktag.k offset length tag
    html_code = substr(filelist.n.guts,offset,length)
    call htmlattrlist html_code,'attrlist'
    drop src href
    do x=1 to attrlist.0
      parse var attrlist.x attrname attrvalue
      select
        when attrname = 'src' then
          src = attrvalue
        when attrname = 'href' then
          href = attrvalue
        when attrname = 'rel' then
          rel = attrvalue
        otherwise nop
       end
     end
    select
      /* First process the damned embedded styles */
      when tag = 'style' then do
        style_end = pos('</STYLE>',translate(filelist.n.guts),offset+length)
        if style_end > 0 then do
          embedded_css = substr(filelist.n.guts,offset+length,style_end-offset-length)
          call css_urlhunt embedded_css,'embedded_uris'
          if embedded_uris.0 > 0 then do
            /*
               We're going to fake this to ourselves, and make it appear as a
               regular file to be processed and inserted here in the end...
            */
            csspos = filelist.0 +1
            filelist.csspos.uri  = filelist.n.uri
            filelist.csspos.guts = embedded_css
            filelist.csspos.ftyp = 'css'
            filelist.csspos.stage= 0
            filelist.0 = csspos
            /* Now insert in this file's links list */
            m = filelist.n.link.0 +1
            filelist.n.link.0 = m
            /* we have to substitute the whole <style </style> block
               -> a special code with 4 parts is a qnd solution */
            filelist.n.link.m = offset+length||' '||length(embedded_css)||' @css '||csspos
            filelist.n.link.m.index = csspos
           end
         end
        else
          call msgout 'Warning: <style> not closed at '||offset||' - HTML code: '||html_code,1
       end
      when tag = 'link' then
        select
          when (pos('ICON',translate(rel))>0 | pos('SHORTCUT',translate(rel))>0) then
            call htm_prepare_get_n_mark href,'image'
          when pos('STYLESHEET',translate(rel))>0 then
            call htm_prepare_get_n_mark href,'css'
          otherwise /* we don't know how to manage this -> don't pull more files */
            call msg_out 'Warning: unknown <link ... > type at '||offset||' - HTML code: '||html_code,1
         end
      when tag = 'script' then
        call htm_prepare_get_n_mark src,'script'
      when (tag = 'img' | tag = 'input' ) then
        call htm_prepare_get_n_mark src,'image'
      when (tag = 'frame' | tag = 'iframe') then
        call htm_prepare_get_n_mark src,'htm'
      otherwise /* we weren't looking for any other tags... */
        call msg_out 'Warning: unexpected tag at '||offset||' - HTML code: '||html_code,1
     end                   /* end filtering out tags */
   end                    /* for each linker tag */
  filelist.n.stage = 1   /* 0 = raw, 1 = ready for substitution, 2 = done */
return

htm_prepare_get_n_mark:
  parse arg htmpgm_url,htmpgm_type
  if filelist.n.base_uri = '' then
    gotit = get_new_file(addurl(filelist.n.uri,htmpgm_url),filelist.n.uri)
  else
  /*
     base_uri is OK to look first for remote files, but for locals we should
     try to find local files first
  */
    if pos('://',filelist.n.uri) >0 then do
      gotit = get_new_file(addurl(filelist.n.base_uri,htmpgm_url),filelist.n.base_uri)
      if gotit = 0 then
        gotit = get_new_file(addurl(filelist.n.uri,htmpgm_url),filelist.n.uri)
     end
    else do
      gotit = get_new_file(addurl(filelist.n.uri,htmpgm_url),filelist.n.uri)
      if gotit = 0 then
        gotit = get_new_file(addurl(filelist.n.base_uri,htmpgm_url),filelist.n.base_uri)
     end
  if gotit > 0 then do
    select
      when htmpgm_type = 'image' then
        if imagetype(gotit) = 'image' then
          call msg_out 'Warning - unknown image type',1
      when htmpgm_type = 'htm' then
        filelist.gotit.base_uri = ''
      otherwise nop 
     end
    /*
       The links list for filelist.n has to be updated here because we're
       updating it manually for embedded CSSs, and for everything else we call
       this function
    */
    filelist.gotit.ftyp = htmpgm_type
    m = filelist.n.link.0 +1
    filelist.n.link.0 = m
    filelist.n.link.m = linktag.k
    filelist.n.link.m.index = gotit
   end
return

htm_subst:
  parse arg n
  /* index already in use: i,j */
  if filelist.n.stage > 1 then
    return
  call msg_out 'Build HTML: '||filelist.n.uri,0
  do k=filelist.n.link.0 to 1 by -1
    findex = filelist.n.link.k.index
    if findex > 0 then do /* Only replace code if we have what with */

      parse var filelist.n.link.k offset length ftag extra

      if ftag = '@css' then do
        if filelist.extra.stage < 2 then
          call msg_out 'Warning - re-merging unparsed CSS '||,
            'into "'||filelist.n.uri||'"',1
        newcode = filelist.extra.guts
       end
      else
        call htmlattrlist substr(filelist.n.guts,offset,length),'fattrlist'

      select

        when ftag = 'link' then do  /* Re-select: only Icons &/| StyleSheet s */
          rel = html_getattr('rel')
          select

            when (pos('ICON',translate(rel))>0 | pos('SHORTCUT',translate(rel))>0) then do
              forgetattrlist = 'type'
              href = 'data:'||imagetype(findex)||';'
              href = href||'base64,'||enbase64(filelist.findex.guts)
              call html_updattr 'href',href
              newcode = '<link'
              call html_dumpattrs
              newcode = newcode||'>'
             end

            when pos('STYLESHEET',translate(rel))>0 then do
              if filelist.findex.stage < 2 then /* Debug code still around... */
                call msg_out 'Warning - merging unparsed file: "'||,
                  filelist.findex.uri||'" into "'||filelist.n.uri||'"',1
              forgetattrlist = 'href rel /'
              type = html_getattr('type')
              if type = '' then
                call SysStemInsert fattrlist,fattrlist.0,'type text/css'
              newcode = '<style'
              call html_dumpattrs
              newcode = newcode||'>'||'0D0A'x
              newcode = newcode||filelist.findex.guts
              newcode = newcode||'0D0A'x||'</style>'
             end

            otherwise /* <link > with unknown rel= */
              nop /* We gave a warning before */
           end
         end

        when ftag = 'script' then do  /* This is automatic */
          forgetattrlist = 'src'
          type = html_getattr('type')
          if type = '' then
            call SysStemInsert fattrlist,fattrlist.0,'type text/javascript'
          newcode = '<script'
          call html_dumpattrs
          newcode = newcode||'>'
          newcode = newcode||'0D0A'x
          newcode = newcode||filelist.findex.guts
          newcode = newcode||'0D0A'x
         end

        when ftag = 'img' then do
          forgetattrlist = 'type'
          src = 'data:'||imagetype(findex)||';'
          src = src||'base64,'||enbase64(filelist.findex.guts)
          call html_updattr 'src',src
          newcode = '<img'
          call html_dumpattrs
          newcode = newcode||'>'
         end

        when ftag = 'input' then do
          forgetattrlist = ''
          src = 'data:'||imagetype(findex)||';'
          src = src||'base64,'||enbase64(filelist.findex.guts)
          call html_updattr 'src',src
          newcode = '<input'
          call html_dumpattrs
          newcode = newcode||'>'
         end

        when ftag = 'frame' then do
          if filelist.findex.stage < 2 then /* Debug code still here */
            call msg_out 'Warning - merging unparsed file: "'||,
              filelist.findex.uri||'" into "'||filelist.n.uri||'"',1
          forgetattrlist = ''
          src = 'data:text/html;' /* <- we oughta refine this... */
          src = src||'base64,'||enbase64(filelist.findex.guts)
          call html_updattr 'src',src
          newcode = '<frame'
          call html_dumpattrs
          newcode = newcode||'>'
         end

        when ftag = 'iframe' then do
          if filelist.findex.stage < 2 then /* Debug code still here */
            call msg_out 'Warning - merging unparsed file: "'||,
              filelist.findex.uri||'" into "'||filelist.n.uri||'"',1
          forgetattrlist = ''
          src = 'data:text/html;' /* <- we oughta refine this... */
          src = src||'base64,'||enbase64(filelist.findex.guts)
          call html_updattr 'src',src
          newcode = '<iframe'
          call html_dumpattrs
          newcode = newcode||'>'
         end

        otherwise /* Should not be any other ftag types */
         nop /* and we gave a warning before anyway */
       end

      filelist.n.guts = substr(filelist.n.guts,1,offset-1)||newcode||substr(filelist.n.guts,offset+length)
     end

   end /* for every linked tag */
  filelist.n.stage = 2
return

/* Warning: 'shorthanded' subroutines - use only within htm_subst */

html_getattr:
  parse arg targetattrname
  do x=1 to fattrlist.0
    parse var fattrlist.x attrname attrvalue
    if attrname = targetattrname then
      return attrvalue
   end
return ''

html_updattr:
  parse arg targetattrname,newvalue
  do x=1 to fattrlist.0
    parse var fattrlist.x attrname attrvalue
    if attrname = targetattrname then do
      fattrlist.x = attrname||' '||newvalue
      return
     end
   end
return

html_dumpattrs:
  do x=1 to fattrlist.0
    parse var fattrlist.x attrname attrvalue
    if pos(attrname,forgetattrlist) = 0 then do
      if attrvalue = '' then
        newcode = newcode||' '||attrname
      else
        newcode = newcode||' '||attrname||'="'||attrvalue||'"'
     end
   end
return

/* Back to 'normal' functions */

css_prepare:
  parse arg n,phase
  if filelist.n.stage >= phase then
    return
  select
    when phase=1 then do
      call msg_out 'Parsing '||filelist.n.uri,0
      filelist.n.link.0 = 0
      call css_urlhunt filelist.n.guts,cssurls
      do k=1 to cssurls.0                  /* we're moving forward this time */
        parse var cssurls.k . . urltype actualurl
        if (urltype <> '@medimp') then do  /* @medimp -> dump */
          gotit = get_new_file(addurl(filelist.n.uri,actualurl),filelist.n.uri)
          if gotit <> 0 then do
            m = filelist.n.link.0 +1
            filelist.n.link.0 = m
            filelist.n.link.m = cssurls.k
            if urltype = '@dirimp' then
              filelist.gotit.ftyp = 'css'
           end
         end
       end
     end

    when phase=2 then do
      call msg_out 'Build CSS : '||filelist.n.uri,0
      do k=filelist.n.link.0 to 1 by -1
        parse var filelist.n.link.k offset length urltype actualurl
        gotit = getfileindex(addurl(filelist.n.uri,actualurl))
        if gotit = 0 then      /* Debug code actually */
          call msg_out 'Warning - file not found: "'||,
            actualurl||'" linked from "'||filelist.n.uri||'"',1
        else do
          if (filelist.gotit.ftyp = 'css' & filelist.gotit.stage < 2) then
            call msg_out 'Warning - merging unparsed file: "'||,
              actualurl||'" into "'||filelist.n.uri||'"',1
          select
            when urltype = '@dirimp' then
              filelist.n.guts = substr(filelist.n.guts,1,offset-1)||,
                filelist.gotit.guts||substr(filelist.n.guts,offset+length)
            when urltype = 'url' then do
              newcode = 'url("data:'||imagetype(gotit)||';'
              newcode = newcode||'base64,'||enbase64(filelist.gotit.guts)||'")'
              filelist.n.guts = substr(filelist.n.guts,1,offset-1)||,
                newcode||substr(filelist.n.guts,offset+length)
             end
            otherwise nop
           end
         end
       end
     end

    otherwise nop
   end

  filelist.n.stage = phase
return

/*
  This is NOT as clean and reusable as the HTML Tag Hunt function: we're
  looking for URIs in the appropriate places, we put them in a list and that's
  it. I also grew tired of coding ;-)

  Fortunately there are few types of URLs in CSS, so this function returns an
  usual REXX array with items in the format:

  offset length type url

  where offset is relative to the start of the css string, length is that of
  the whole CSS rule to be considered, type is either url, @dirimp or @medimp
  and url is the actual uri where a new file should be looked for

*/

css_urlhunt:
  parse arg cssdata,listname
  urlcnt = 0
  dlen = length(cssdata)
  curpos = 1
  do until (nextpos = 0 | curpos >= dlen )
    drop rules
    nextpos = pos('{',cssdata,curpos)
    stuff_sp = curpos
    if nextpos = 0 then
      stuff = substr(cssdata,curpos)
    else do
      stuff = substr(cssdata,curpos,nextpos-curpos)
      curpos = nextpos +1
      nextpos = pos('}',cssdata,curpos)
      rules_sp = curpos
      if nextpos = 0 then
        rules = substr(cssdata,curpos)
      else do
        rules = substr(cssdata,curpos,nextpos-curpos)
        curpos = nextpos +1
       end
     end
    /* There may be more rules or not, now analyze that! */
    stuff_cp = 1
    slen = length(stuff)
    do until (stuff_np = 0 | stuff_cp >= slen)
      stuff_np = pos('@IMPORT',translate(stuff),stuff_cp)
      if stuff_np > 0 then do
        stuff_cp = stuff_np
        stuff_np = pos(';',stuff,stuff_cp)
        if stuff_np > 0 then do
          urlcnt = urlcnt+1
          url_pos = stuff_sp+stuff_cp-1
          url_len = stuff_np-stuff_cp+1
          url = substr(cssdata,url_pos,url_len)
          url = translate(url,'  ','0D0A'x)
          parse var url . url
          if pos('(',url)>0 then
            parse var url . '(' url ')' cssmedia ';'
          else do
            delim = left(url,1)
            parse var url (delim) url (delim) cssmedia ';'
           end
          url = strip(url)
          url = strip(url,,'"')
          url = strip(url,,"'")
          cssmedia = strip(cssmedia)
          output = url_pos||' '||url_len
          if cssmedia = '' then
            output = output||' '||'@dirimp'
          else
            output = output||' '||'@medimp'
          call value value(listname).urlcnt,output||' '||url
          stuff_cp = stuff_np
         end
       end
     end
    if rules <> 'RULES' then do
      rules_cp = 1
      rlen = length(rules)
      do until (rules_np = 0 | rules_cp >= rlen)
        rules_np = pos('URL(',translate(rules),rules_cp)
        if rules_np > 0 then do
          rules_cp = rules_np
          rules_np = pos(')',rules,rules_cp)
          if rules_np > 0 then do
            urlcnt = urlcnt+1
            url_pos = rules_sp+rules_cp-1
            url_len = rules_np-rules_cp+1
            url = substr(cssdata,url_pos,url_len)
            parse var url . '(' url ')'
            url = translate(url,'  ','0D0A'x)
            url = strip(url)
             /* '"' or "'" should not be inside URIs anyway */
             /* and we're preventing non-closed URIs */
            url = strip(url,,'"')
            url = strip(url,,"'")
            call value value(listname).urlcnt,url_pos||' '||url_len||' url '||url
            rules_cp = rules_np
           end
         end
       end
     end
   end
  call value value(listname).0,urlcnt
return

getfileindex: procedure expose filelist.
  parse arg targeturi
  parse var targeturi targeturi '#' . /* "#" only separates document fragments */
  do i=1 to filelist.0
    if targeturi = filelist.i.uri then
     return i
   end
  return 0
return

get_new_file: procedure expose filelist. bad_uris.
  parse arg file,referer
  parse var file file '#' .
  do i=1 to bad_uris.0
    if bad_uris.i = file then do
      call msg_out 'Skipping bad URI: '||file,0
      return 0
     end
   end
  do i=1 to filelist.0
    if filelist.i.uri = file then do
      call msg_out 'Already read: '||file,0
      return i
     end
   end
  i = 0
  data = getfilecontents(file,referer)
  if length(data) > 0 then do
    i = filelist.0 +1
    filelist.0 = i
    filelist.i.uri  = file
    filelist.i.guts = data
    filelist.i.stage= 0
   end
  else do
    j = bad_uris.0 +1
    bad_uris.0 = j
    bad_uris.j = file
   end
return i

imagetype: procedure expose filelist.
  parse arg n
  select
    when left(filelist.n.guts,4) = 'PNG' then
      return 'image/png'
    when left(filelist.n.guts,3) = 'GIF' then
      return 'image/gif'
    when substr(filelist.n.guts,1,2) = 'FFD8'x then
      return 'image/jpeg'
    otherwise nop
   end
return 'image'

addurl: procedure
  parse arg baseuri,rel_uri
  rel_uri = subst(rel_uri,'&amp;','&',0)
  if (pos('://',rel_uri)>0 | pos(':\',rel_uri)>0)then
    return rel_uri
  /* Some DOS-related stupidity fixes */
  if pos('\',rel_uri) > 0 then
    rel_uri = subst(rel_uri,'\','/')
  if pos('\',baseuri)>0 then
    ds = '\'
  else
    ds = '/'
  parse var baseuri scheme '://' host '/' path '?' query '#' anchor
  select
    /* Disguised absolute URIs, found in IBM's pages  */
    when left(rel_uri,2) = '//' then
      res = scheme||':'||rel_uri
    when left(rel_uri,1) = '/' then
      res = scheme||'://'||host||rel_uri
    otherwise
      /* Now the common part */
      base = substr(baseuri,1,lastpos(ds,baseuri))
      if left(rel_uri,2)='./' then
        rel_uri = substr(rel_uri,3)
      do while left(rel_uri,3) = '../'
        parse var rel_uri '../' rel_uri
        base = substr(base,1,length(base)-1)
        base = substr(base,1,lastpos(ds,base))
       end
      res = subst(base||rel_uri,'/',ds)
   end
return res

output: procedure expose dir_sep output
  parse arg origin,data
  if output = 'cgi' then do
    /* we should investigate the headers, but ... */
    call lineout stdout,'Content-type: text/html;'
    call lineout stdout,''
    call charout stdout,data
   end
  else do
    /* make up a good output filename and dump the data */
    if left(translate(origin),7) = 'HTTP://' then do
      /* probably we'll change the URI -> local name conv scheme later */
      parse var origin . '://' target '?' . 
      if right(target,1) = '/' then
        target = target||'index.htm'
      target = subst(target,'/','_')
     end
    else do
      target = substr(origin,lastpos(dir_sep,origin)+1)
      select
        when right(target,1) = 'm' then
          target = target||'l'
        when right(target,1) = 'l' then
          target = substr(target,1,length(target)-1)
        otherwise nop
       end
     end
    alreadyexists = stream(target,'C','QUERY EXISTS')
    if alreadyexists <> '' then
      target = SysTempFileName('UniPage???.htm');
    call msg_out 'Saving file '||target||', '||length(data)||' bytes.',0
    rc = charout(target,data)
    if rc <> 0 then
      call msg_out 'Write error! ',1
    rc = stream(target,'C','CLOSE')
    if rc <> 'READY:' then
      call msg_out 'Close file error! ',1
   end
return

bomb_out:
  parse arg code
  if output='cgi' then do
    call lineout stdout,'Content-type: text/plain; charset=UTF-8'
    call lineout stdout,''
    call charout stdout,cgi_output
   end
  exit code
return

msg_out:
  parse arg msg,msgtype
  if output = 'cgi' then do
    cgi_output = cgi_output||'0D0A'x||msg
    return
   end
  select
    when msgtype = 0 then do
      call lineout stdout,msg
     end
    when msgtype = 1 then do
      call lineout stderr,msg
     end
    otherwise nop
   end
return

getfilecontents: procedure
  parse arg target,referer
  data = ''
  /* We'll think how to get remote files later */
  if pos('://',target) > 0 then do
    parse var target scheme '://' .
    if translate(scheme) = 'FILE' then
      leave
    KillTarget = 1
    url = target
    parse var target target '?' .
    if right(target,1) = '/' then
      target = 'index.htm'
    else
      target = substr(target,lastpos('/',target)+1)
    alreadyexists = stream(target,'C','QUERY EXISTS')
    if alreadyexists <> '' then
      target = SysTempFileName('UniPage?.???');
    call msg_out 'Retrieving: "'||url||'"',0
    'wget -q -O '||target||' -U "'||UserAgent||'" --referer="'||referer||'" "'||url||'"'
    if rc = 1  then do
      call msg_out 'Error: "'url'" could not be retrieved.',1
      return data
     end
   end
  /* then at last we're chasing a local file no matter what */
  data = charin(target,1,chars(target))
  call stream target,'C','CLOSE'
  if length(data) = 0 then
    call msg_out 'Error: could not get file contents for "'||target||'".',1
  else
    call msg_out '+File read: '||target||', '||length(data)||' bytes.',0
  if KillTarget = 1 then
    'del "'||target||'" >NUL'
return data

/*
   htmltaghunt v1.1 2006/12/22

   Function that goes through an HTML string with tags and text and builds an
   array with data from the tags that match ANY of the specified conditions
   Parameters:
   1) HTML String
   2) Name of the array to put the data into
   3) List of tags to search for, in the special format:
    'tagspec_1 tagspec_2 ... tagspec_n' where any tagspec can be:
    - a simple "tag", i.e. 'table', or even '/table'
    - a tag that must have a certain attribute to be matched, in the format
     'tag.attribute'
    - a tag with an attribute that needs to match a certain value, in the
     format 'tag.attribute=value'
    - a tag with an attribute that needs to be different from a certain value,
     in the format 'tag.attribute!value'
    - a tag with an attribute that needs to contain a certain substring, in the
     format 'tag.attribute>substring'
    - a tag with an attribute that needs not to contain a certain substring, in
     the format 'tag.attribute<substring'

   Remarks:
   1) Tag and attribute names to search for are matched insensitively to ensure
      a maximum number of matches
   2) When a substring is searched for within an attribute, search is performed
      case-insensitively, to ensure a maximum number of matches
   3) When a substring is NOT wanted within an attribute, search is performed
      case-sensitively, to ensure a minimum number of matches
   There's a call to translate(...) 'everywhere where appropriate' in the code
   below...
   4) Attribute substrings specified in the 'matches' list cannot contain
      spaces.
   5) Anything between an innermost '<' and '>' pair is considered a tag i.e.
    '<tag blah blah >' inside '<!-- <tag blah blah > -->' IS a tag
    I know that this is inside a comment, but you'll have to live with it.
    This allows for CC stuff and closing tags like '/table' to be processed as
    well without any more code.

   A poor man's DOM? You bet it!

*/

htmltaghunt:
  parse arg htmldata,listname,targtags
  TagTerm = '0D0A'x||' <>'
  dlen    = length(htmldata)
  mycnt = 0
  do while length(targtags)>0
    parse var targtags newtag targtags
    mycnt = mycnt+1
    parse var newtag tag.mycnt '.' rest
    if rest = '' then
      attr.mycnt = ''
    else
      select
        when pos('=',rest)>0 then do
          parse var rest attr.mycnt '=' compval.mycnt
          comptype.mycnt = '='
         end
        when pos('!',rest)>0 then do
          parse var rest attr.mycnt '!' compval.mycnt
          comptype.mycnt = '!'
         end
        when pos('>',rest)>0 then do
          parse var rest attr.mycnt '>' compval.mycnt
          comptype.mycnt = '>'
         end
        when pos('<',rest)>0 then do
          parse var rest attr.mycnt '<' compval.mycnt
          comptype.mycnt = '<'
         end
        otherwise
          attr.mycnt = rest
          compval.mycnt = ''
          comptype.mycnt = '-'
       end
    tag.mycnt  = translate(tag.mycnt,xrange('a','z'),xrange('A','Z'))
    attr.mycnt = translate(attr.mycnt,xrange('a','z'),xrange('A','Z'))
   end
  tag.0 = mycnt
  curpos  = 1
  nextpos = 1
  tagcnt = 0
  do until (nextpos = 0 | curpos >= dlen )
    nextpos = pos('<',htmldata,curpos)
    tag = ''
    if nextpos > 0 then do
      curpos   = nextpos
      nextchar = ''
      do while ((pos(nextchar,TagTerm)=0) & (curpos < dlen))
        nextchar = substr(htmldata,curpos+1,1)
        curpos = curpos +1
        if pos(nextchar,TagTerm)=0 then
          tag = tag||nextchar
       end
      tag = translate(tag,xrange('a','z'),xrange('A','Z'))
      if nextchar <> '<' then do   /* "<whatever<..." things are out now */
        closepos = pos('>',htmldata,curpos)
        if closepos > 0 then do
          do mycnt=1 to tag.0
            if (tag.mycnt = tag) | (tag.mycnt = '*') then do
              if attr.mycnt = '' then do /* only tag to match is specified */
                tagcnt = tagcnt +1
                call value value(listname).tagcnt,nextpos||' '||closepos-nextpos+1||' '||tag
               end
              else do
                call htmlattrlist substr(htmldata,nextpos,closepos-nextpos+1),testattr
                match = 0
                do my2=1 to testattr.0
                  parse var testattr.my2 testattr.my2 attrval
                  if attr.mycnt = testattr.my2 then do
                    select
                      when comptype.mycnt = '-' then
                        match = 1
                      when comptype.mycnt = '=' then
                        match = (compval.mycnt = attrval)
                      when comptype.mycnt = '!' then
                        match = (compval.mycnt <> attrval)
                      when comptype.mycnt = '>' then
                        match = (pos(translate(compval.mycnt),translate(attrval))>0)
                      when comptype.mycnt = '<' then
                        match = (pos(compval.mycnt,attrval)=0)
                      otherwise nop
                     end
                   end
                  if match = 1 then do
                    tagcnt = tagcnt +1
                    call value value(listname).tagcnt,nextpos||' '||closepos-nextpos+1||' '||tag
                    leave /* leave my2 */
                   end
                 end
               end
             end
           end
         end /* if closepos > 0*/
       end /* if nextchar <> '<' */
     end /* if nextpos > 0 */
   end /* until nextpos = 0 | curpos >= dlen */
  call value value(listname).0,tagcnt
return

/*
   htmlattrlist v1.0 2006/12/22

   Function that goes through an HTML tag string and builds an array with
   attibute + space + value strings, thus saving the need to further search
   for equal signs, quotes and the like - purportedly at least!

   Parameters:
   1) tag string (the text delimited by '<' and '>', included)
   2) Name of the array to put the data into

   Remarks:
   1) In the output array, attribute names are converted to lowercase but of
     course NOT attribute values
   2) Output array components have the form 'attrname[ attrvalue]' i.e. if
     an attribute has ANY value (even ''), it is appended to the corresponding
     array element along with a blank space.
     This allows to distinguish 'contracted' attribute from modern-style ones.
*/

htmlattrlist:
  parse arg htmlstring,list
  attrcnt = 0
  if (left(htmlstring,1) <> '<' | right(htmlstring,1) <> '>' ) then do
    call value value(list).0,attrcnt
    return 1
   end
  htmlstring = substr(htmlstring,2,length(htmlstring)-2)
  htmlstring = translate(htmlstring,'  ','0D0A'x)
  parse var htmlstring . htmlstring   /* the tag is always leading */
  htmlstring = strip(htmlstring)
  do while length(htmlstring)>0
    /* Any '=' inside an attr value should be between quotes, so... */
    parse var htmlstring curattr '=' htmlstring
    do while pos(' ',curattr)>0
      parse var curattr preattr curattr
      curattr = strip(curattr)
      attrcnt = attrcnt +1
      preattr = translate(preattr,xrange('a','z'),xrange('A','Z'))
      call value value(list).attrcnt,preattr
     end
    attrcnt = attrcnt +1
    curattr = translate(curattr,xrange('a','z'),xrange('A','Z'))
    newattr = curattr
    htmlstring = strip(htmlstring)
    if htmlstring <> '' then do
      select
        when left(htmlstring,1)='"' then
          valterm = '"'
        when left(htmlstring,1)="'" then
          valterm = "'"
        otherwise
          valterm = " "
       end
      endval = pos(valterm,htmlstring,2)
      if endval > 0 then do
        curval = substr(htmlstring,1,endval)
        htmlstring = substr(htmlstring,endval+1)
        htmlstring = strip(htmlstring)
       end
      else do
        curval = htmlstring
        htmlstring = ''
       end
      curval = strip(curval,,'"')
      curval = strip(curval,,"'")
      newattr = curattr||' '||curval
     end
    call value value(list).attrcnt,newattr
   end
  call value value(list).0,attrcnt
return 0

/*
   subst v1.1 2006/12/19

   Funcin que sustituye una cadena dentro de una ms grande por otra cadena;
   Parmetros:
   1) Cadena que contiene lo que queremos sustituir
   2) Cadena que queremos sustituir
   3) Por lo que queremos sustituilla
   4) Cuntas apariciones queremos sustituir (0=todas), empezando por el
     principio

   Observaciones:
   1) En caso de no querer hacer sustituciones, bastara con NO llamar a la
     funcin, de ah que se pueda especificar n=0 para sustituir todas las
     apariciones
*/

subst: procedure
  parse arg string,searchthis,replacement,howmany
  if howmany = '' then howmany = 0
  len = length(searchthis)
  changes = 0
  ready = ''
  do until (loc = 0) | changes=howmany
    loc = pos(searchthis,string)
    if loc > 0 then do
      ready = ready || substr(string,1,loc-1) || replacement
      string = substr(string,loc+len)
      howmany=howmany+1
     end
   end
return ready || string

/*  */

enbase64: procedure
  parse arg data
  base64 = ''
  n = length(data) % 3
  tail_l = length(data)//3
  do i=1 to n
    buffer = substr(data,3*i-2,3)
    base64 = base64||bits826(buffer)
   end
  if tail_l >0 then do
    tail = right(data,tail_l)||copies('00'x,3-tail_l)
    tail = left(bits826(tail),tail_l+1)||copies('=',3-tail_l)
    base64 = base64||tail
   end
return base64

bits826: procedure
  charset = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
  parse arg data /* Must be a 3-byte string */
  index = x2b(c2x(data))
  out = ''
  do j=1 to 4
    charidx = substr(index,6*j-5,6)
    charidx = x2d(b2x(charidx)) +1
    out = out||substr(charset,charidx,1)
   end
return out
