Module:Webarchive

From Dharmawiki
Jump to navigation Jump to search

--[[ ----------------------------------

    Lua module implementing the Error in webarchive template: Check |url= value. Empty. template. 
      A merger of the functionality of three templates: Error in wayback template: Check |url= value. Empty.‹The template Wayback is being considered for merging.› , Template:Webcite and Template:Cite archives
  
 ]]

local p = {}

--[[--------------------------< inlineError >-----------------------

    Critical error. Render output completely in red. Add to tracking category.
]]

local function inlineError(arg, msg)

 track["Category:Webarchive template errors"] = 1
 return 'Error in webarchive template: Check |' .. arg .. '= value. ' .. msg .. ''

end

--[[--------------------------< inlineRed >-----------------------

     Render a text fragment in red, such as a warning as part of the final output.
     Add tracking category.
]]

local function inlineRed(msg, trackmsg)

 if trackmsg == "warning" then
   track["Category:Webarchive template warnings"] = 1 
 elseif trackmsg == "error" then
   track["Category:Webarchive template errors"] = 1 
 end
 return '' .. msg .. ''

end

--[[--------------------------< trimArg >-----------------------

    trimArg returns nil if arg is "" while trimArg2 returns 'true' if arg is "" 
    trimArg2 is for args that might accept an empty value, as an on/off switch like nolink=
]]

local function trimArg(arg)

 if arg == "" or arg == nil then
   return nil
 else
   return mw.text.trim(arg)
 end

end local function trimArg2(arg)

 if arg == nil then
   return nil
 else
   return mw.text.trim(arg)
 end

end

--[[--------------------------< base62 >-----------------------

    Convert base-62 to base-10
    Credit: https://de.wikipedia.org/wiki/Modul:Expr 
 ]]

local function base62( value )

   local r = 1
   if value:match( "^%w+$" ) then
       local n = #value
       local k = 1
       local c
       r = 0
       for i = n, 1, -1 do
           c = value:byte( i, i )
           if c >= 48  and  c <= 57 then
               c = c - 48
           elseif c >= 65  and  c <= 90 then
               c = c - 55
           elseif c >= 97  and  c <= 122 then
               c = c - 61
           else    -- How comes?
               r = 1
               break    -- for i
           end
           r = r + c * k
           k = k * 62
       end -- for i
   end
   return r

end

--[[--------------------------< tableLength >-----------------------

     Given a 1-D table, return number of elements
 ]]

local function tableLength(T)

 local count = 0
 for _ in pairs(T) do count = count + 1 end
 return count

end


--[[--------------------------< dateFormat >-----------------------

    Given a date string, return its format: dmy, mdy, iso, ymd
      If unable to determine return nil
 ]]

local function dateFormat(date)

 local dt = {}
 dt.split = {}
 dt.split = mw.text.split(date, "-")
 if tableLength(dt.split) == 3 then
   if tonumber(dt.split[1]) > 1900 and tonumber(dt.split[1]) < 2200 and tonumber(dt.split[2]) and tonumber(dt.split[3]) then
     return "iso"
   else
     return nil
   end
 end  
 dt.split = mw.text.split(date, " ")
 if tableLength(dt.split) == 3 then
   if tonumber(dt.split[3]) then
     if tonumber(dt.split[3]) > 1900 and tonumber(dt.split[3]) < 2200 then
       if tonumber(dt.split[1]) then
         return "dmy"
       else
         return "mdy"
       end 
     else
       if tonumber(dt.split[1]) then
         if tonumber(dt.split[1]) > 1900 and tonumber(dt.split[1]) < 2200 then
           return "ymd"
         end
       end
     end
   end
 end
 return nil

end

--[[--------------------------< makeDate >-----------------------

    Given a zero-padded 4-digit year, 2-digit month and 2-digit day, return a full date in df format
    df = mdy, dmy, iso, ymd
]]

local function makeDate(year, month, day, df)

 if not year or year == "" or not month or month == "" or not day or day == "" then
   return nil
 end
 local zmonth = month                                                      -- month with leading 0
 month = month:match("0*(%d+)")                                            -- month without leading 0
 if tonumber(month) < 1 or tonumber(month) > 12 then
   return year
 end
 local nmonth = os.date("%B", os.time{year=2000, month=month, day=1} )     -- month in name form       
 if not nmonth then
   return year
 end
 local zday = day
 day = zday:match("0*(%d+)")
 if tonumber(day) < 1 or tonumber(day) > 31 then
   if df == "mdy" or df == "dmy" then
     return nmonth .. " " .. year
   elseif df == "iso" then
     return year .. "-" .. zmonth 
   elseif df == "ymd" then
     return year .. " " .. nmonth
   else
     return nmonth .. " " .. year
   end
 end                                       
 if df == "mdy" then
   return nmonth .. " " .. day .. ", " .. year         -- September 1, 2016
 elseif df == "dmy" then
   return day .. " " .. nmonth .. " " .. year          -- 1 September 2016
 elseif df == "iso" then
   return year .. "-" .. zmonth .. "-" .. zday         -- 2016-09-01
 elseif df == "ymd" then
   return year .. " " .. nmonth .. " " .. cday          -- 2016 September 1
 else
   return nmonth .. " " .. day .. ", " .. year         -- September 1, 2016
 end

end


--[[--------------------------< decodeWebciteDate >-----------------------

     Given a URI-path to Webcite (eg. /67xHmVFWP) return the encoded date in df format
 ]]

local function decodeWebciteDate(path, df)

   local dt = {}
   dt.split = {}
   dt.split = mw.text.split(path, "/")
   -- valid URL formats that are not base62
   -- http://www.webcitation.org/query?id=1138911916587475
   -- http://www.webcitation.org/query?url=http..&date=2012-06-01+21:40:03
   -- http://www.webcitation.org/1138911916587475
   -- http://www.webcitation.org/cache/73e53dd1f16cf8c5da298418d2a6e452870cf50e
   -- http://www.webcitation.org/getfile.php?fileid=1c46e791d68e89e12d0c2532cc3cf629b8bc8c8e
   if mw.ustring.find( dt.split[2], "query", 1, plain) or 
      mw.ustring.find( dt.split[2], "cache", 1, plain) or
      mw.ustring.find( dt.split[2], "getfile", 1, plain) or
      tonumber(dt.split[2]) then
     return "query"
   end
   dt.full = os.date("%Y %m %d", string.sub(string.format("%d", base62(dt.split[2])),1,10) )
   dt.split = mw.text.split(dt.full, " ")
   dt.year = dt.split[1]
   dt.month = dt.split[2]
   dt.day = dt.split[3]
   if not tonumber(dt.year) or not tonumber(dt.month) or not tonumber(dt.day) then
     return inlineRed("[Date error] (1)", "error")
   end
   if tonumber(dt.month) > 12 or tonumber(dt.day) > 31 or tonumber(dt.month) < 1 then
     return inlineRed("[Date error] (2)", "error")
   end
   if tonumber(dt.year) > tonumber(os.date("%Y")) or tonumber(dt.year) < 1900 then
     return inlineRed("[Date error] (3)", "error")
   end
   fulldate = makeDate(dt.year, dt.month, dt.day, df)
   if not fulldate then
     return inlineRed("[Date error] (4)", "error")
   else
     return fulldate
   end

end

--[[--------------------------< snapDateToString >-----------------------

Given a URI-path to Wayback (eg. /web/20160901010101/http://example.com )

 return the formatted date eg. "September 1, 2016" in df format 
 Handle non-digits in snapshot ID such as "re_" and "-" and "*"
]]

local function decodeWaybackDate(path, df)

   local snapdate, snapdatelong, currdate, fulldate
   local safe = path
   snapdate = string.gsub(safe, "^/w?e?b?/?", "")                      -- Remove leading "/web/" or "/"
   safe = snapdate
   local N = mw.text.split(safe, "/")
   snapdate = N[1]
   if snapdate == "*" then                                             -- eg. /web/*/http..
     return "index"
   end
   safe = snapdate
   snapdate = string.gsub(safe, "[a-z][a-z]_[0-9]?$", "")              -- Remove any trailing "re_" from date 
   safe = snapdate
   snapdate = string.gsub(safe, "[-]", "")                             -- Remove dashes from date eg. 2015-01-01 
   safe = snapdate
   snapdate = string.gsub(safe, "[*]$", "")                            -- Remove trailing "*" 
   if not tonumber(snapdate) then
     return inlineRed("[Date error] (2)", "error")
   end
   local dlen = string.len(snapdate)
   if dlen < 4 then
     return inlineRed("[Date error] (3)", "error")
   end
   if dlen < 14 then
     snapdatelong = snapdate .. string.rep("0", 14 - dlen)
   else
     snapdatelong = snapdate
   end
   local year = string.sub(snapdatelong, 1, 4)
   local month = string.sub(snapdatelong, 5, 6)
   local day = string.sub(snapdatelong, 7, 8)
   if not tonumber(year) or not tonumber(month) or not tonumber(day) then
     return inlineRed("[Date error] (4)", "error")
   end
   if tonumber(month) > 12 or tonumber(day) > 31 or tonumber(month) < 1 then
     return inlineRed("[Date error] (5)", "error")
   end
   currdate = os.date("%Y")
   if tonumber(year) > tonumber(currdate) or tonumber(year) < 1900 then
     return inlineRed("[Date error] (6)", "error")
   end
   fulldate = makeDate(year, month, day, df)
   if not fulldate then
     return inlineRed("[Date error] (7)", "error")
   else
     return fulldate
   end

end


--[[--------------------------< serviceName >-----------------------

    Given a domain extracted by mw.uri.new() (eg. web.archive.org) set tail string and service ID
 ]]

local function serviceName(host, nolink)

 local tracking = "Category:Webarchive template other archives"
 local bracketopen = "[["
 local bracketclose = "]]"
 if nolink then
   bracketopen = ""
   bracketclose = ""
 end
 ulx.url1.service = "other"
 ulx.url1.tail = " at " .. ulx.url1.host .. " " .. inlineRed("Error: unknown archive URL")
 if mw.ustring.find( host, "archive.org", 1, plain ) then
   ulx.url1.service = "wayback"
   ulx.url1.tail = " at the " .. bracketopen .. "Wayback Machine" .. bracketclose
   tracking = "Category:Webarchive template wayback links"
 elseif mw.ustring.find( host, "webcitation.org", 1, plain ) then
   ulx.url1.service = "webcite"
   ulx.url1.tail = " at " .. bracketopen .. "WebCite" .. bracketclose
   tracking = "Category:Webarchive template webcite links"
 elseif mw.ustring.find( host, "archive.is", 1, plain ) then
   ulx.url1.service = "archiveis"
   ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose
   tracking = "Category:Webarchive template archiveis links"
 elseif mw.ustring.find( host, "archive.fo", 1, plain ) then
   ulx.url1.service = "archiveis"
   ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose
   tracking = "Category:Webarchive template archiveis links"
 elseif mw.ustring.find( host, "archive.today", 1, plain ) then
   ulx.url1.service = "archiveis"
   ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose
   tracking = "Category:Webarchive template archiveis links"
 elseif mw.ustring.find( host, "archive.il", 1, plain ) then
   ulx.url1.service = "archiveis"
   ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose
   tracking = "Category:Webarchive template archiveis links"
 elseif mw.ustring.find( host, "archive.ec", 1, plain ) then
   ulx.url1.service = "archiveis"
   ulx.url1.tail = " at " .. bracketopen .. "Archive.is" .. bracketclose
   tracking = "Category:Webarchive template archiveis links"
 elseif mw.ustring.find( host, "archive[-]it.org", 1, plain ) then
   ulx.url1.service = "archiveit"
   ulx.url1.tail = " at " .. bracketopen .. "Archive-It" .. bracketclose
 elseif mw.ustring.find( host, "arquivo.pt", 1, plain) then
   ulx.url1.tail = " at the " .. "Portuguese Web Archive" 
 elseif mw.ustring.find( host, "loc.gov", 1, plain ) then
   ulx.url1.tail = " at the " .. bracketopen .. "Library of Congress" .. bracketclose
 elseif mw.ustring.find( host, "webharvest.gov", 1, plain ) then
   ulx.url1.tail = " at the " .. bracketopen .. "National Archives and Records Administration" .. bracketclose
 elseif mw.ustring.find( host, "bibalex.org", 1, plain ) then
   ulx.url1.tail = " at " .. "Bibliotheca Alexandrina"
 elseif mw.ustring.find( host, "collectionscanada", 1, plain ) then
   ulx.url1.tail = " at the " .. "Canadian Government Web Archive"
 elseif mw.ustring.find( host, "haw.nsk", 1, plain ) then
   ulx.url1.tail = " at the " .. "Croatian Web Archive (HAW)"
 elseif mw.ustring.find( host, "veebiarhiiv.digar.ee", 1, plain ) then
   ulx.url1.tail = " at the " .. "Estonian Web Archive"
 elseif mw.ustring.find( host, "vefsafn.is", 1, plain ) then
   ulx.url1.tail = " at the " .. "National and University Library of Iceland"
 elseif mw.ustring.find( host, "proni.gov", 1, plain ) then
   ulx.url1.tail = " at the " .. bracketopen .. "Public Record Office of Northern Ireland" .. bracketclose
 elseif mw.ustring.find( host, "uni[-]lj.si", 1, plain ) then
   ulx.url1.tail = " at the " .. "Slovenian Web Archive"
 elseif mw.ustring.find( host, "stanford.edu", 1, plain ) then
   ulx.url1.tail = " at the " .. "Stanford Web Archive"
 elseif mw.ustring.find( host, "nationalarchives.gov.uk", 1, plain ) then
   ulx.url1.tail = " at the " .. bracketopen .. "UK Government Web Archive" .. bracketclose
 elseif mw.ustring.find( host, "parliament.uk", 1, plain ) then
   ulx.url1.tail = " at the " .. bracketopen .. "UK Parliament's Web Archive" .. bracketclose
 elseif mw.ustring.find( host, "webarchive.org.uk", 1, plain ) then
   ulx.url1.tail = " at the " .. bracketopen .. "UK Web Archive" .. bracketclose
 elseif mw.ustring.find( host, "nlb.gov.sg", 1, plain ) then
   ulx.url1.tail = " at " .. "Web Archive Singapore" 
 elseif mw.ustring.find( host, "pandora.nla.gov.au", 1, plain ) then
   ulx.url1.tail = " at " .. bracketopen .. "Pandora Archive" .. bracketclose 
 elseif mw.ustring.find( host, "perma.cc", 1, plain ) then
   ulx.url1.tail = " at " .. bracketopen .. "Perma.cc" .. bracketclose
 elseif mw.ustring.find( host, "perma-archives.cc", 1, plain ) then
   ulx.url1.tail = " at " .. bracketopen .. "Perma.cc" .. bracketclose
 elseif mw.ustring.find( host, "screenshots.com", 1, plain ) then
   ulx.url1.tail = " at Screenshots" 
 elseif mw.ustring.find( host, "wikiwix.com", 1, plain ) then
   ulx.url1.tail = " at Wikiwix" 
 elseif mw.ustring.find( host, "freezepage.com", 1, plain ) then
   ulx.url1.tail = " at Freezepage" 
 elseif mw.ustring.find( host, "webcache.googleusercontent.com", 1, plain ) then
   ulx.url1.tail = " at Google Cache" 
 else
   tracking = "Category:Webarchive template unknown archives"
 end
 track[tracking] = 1

end

--[[--------------------------< parseExtraArgs >-----------------------

    Parse numbered arguments starting at 2, such as url2..url10, date2..date10, title2..title10
      For example: Lua error at line 342: attempt to concatenate field 'host' (a nil value).
        Three url arguments not in numeric sequence (1..4..7). 
        Function only processes arguments numbered 2 or greater (in this case 4 and 7)
        It creates numeric sequenced table entries like:
          urlx.url2.url = <argument value for url4>
          urlx.url3.url = <argument value for url7>
      Returns the number of URL arguments found numbered 2 or greater (in this case returns "2")
]]

local function parseExtraArgs()

 local i, j, argurl, argurl2, argdate, argtitle
 j = 2
 for i = 2, maxurls do
   argurl = "url" .. i
   if trimArg(args[argurl]) then
     argurl2 = "url" .. j
     ulx[argurl2] = {}
     ulx[argurl2]["url"] = args[argurl]
     argdate = "date" .. j
     if trimArg(args[argdate]) then
       ulx[argurl2]["date"] = args[argdate]
     else
       ulx[argurl2]["date"] = inlineRed("[Date missing]", "warning")
     end
     argtitle = "title" .. j
     if trimArg(args[argtitle]) then
       ulx[argurl2]["title"] = args[argtitle]
     else
       ulx[argurl2]["title"] = nil
     end
     j = j + 1
   end
 end
 if j == 2 then
   return 0
 else
   return j - 2
 end

end

--[[--------------------------< comma >-----------------------

    Given a date string, return "," if it's MDY 
 ]]

local function comma(date)

 local N = mw.text.split(date, " ")
 local O = mw.text.split(N[1], "-") -- for ISO
 if O[1] == "index" then return "" end
 if not tonumber(O[1]) then
   return ","
 else
   return ""
 end

end

--[[--------------------------< createTracking >-----------------------

    Return data in track[] ie. tracking categories
 ]]

local function createTracking()

 local sand = ""
 if tableLength(track) > 0 then                        
   for key,_ in pairs(track) do
     sand = sand .. "" .. key .. ""
   end
 end
 return sand

end

--[[--------------------------< createRendering >-----------------------

    Return a rendering of the data in ulx[][]
 ]]

local function createRendering()

   local sand, displayheader, displayfield
   local period1 = ""   -- For backwards compat with Error in wayback template: Check |url= value. Empty.‹The template Wayback is being considered for merging.› 
   local period2 = "."                                                            
 
   local indexstr = "archived"
   if ulx.url1.date == "index" then
     indexstr = "archive"
   end  
                                                                                         -- For Error in wayback template: Check |url= value. Empty.‹The template Wayback is being considered for merging.› , Template:Webcite
   if ulx.url1.format == "none" then                                                     
     if not ulx.url1.title and not ulx.url1.date then                                    -- No title. No date
       sand = "[" .. ulx.url1.url .. " Archived]" .. ulx.url1.tail
     elseif not ulx.url1.title and ulx.url1.date then                                    -- No title. Date.
       if ulx.url1.service == "wayback" then 
         period1 = "."
         period2 = "" 
       end
       sand = "[" .. ulx.url1.url .. " Archived] " .. ulx.url1.date .. comma(ulx.url1.date) .. ulx.url1.tail .. period1
     elseif ulx.url1.title and not ulx.url1.date then                                    -- Title. No date.
       sand = "[" .. ulx.url1.url .. " " .. ulx.url1.title .. "]" .. ulx.url1.tail
     elseif ulx.url1.title and ulx.url1.date then                                        -- Title. Date.
       sand = "[" .. ulx.url1.url .. " " .. ulx.url1.title .. "]" .. ulx.url1.tail .. " (" .. indexstr .. " " .. ulx.url1.date .. ")"
     else
       return nil
     end
     if ulx.url1.extraurls > 0 then                                                      -- For multiple archive URLs
       local tot = ulx.url1.extraurls + 1
       sand = sand .. period2 .. " Additional archives: "
       for i=2,tot do
         local indx = "url" .. i
         if ulx[indx]["title"] then 
           displayfield = "title"
         else
           displayfield = "date"
         end
         sand = sand .. "[" .. ulx[indx]["url"] .. " " .. ulx[indx][displayfield] .. "]"
         if i == tot then
           sand = sand .. "."
         else
           sand = sand .. ", "
         end
       end
     else
       return sand  
     end
     return sand
                                                                                         -- For Template:Cite archives
   else                                                                  
     if ulx.url1.format == "addlarchives" then                           -- Multiple archive services 
       displayheader = "Additional archives: "
     else                                                                -- Multiple pages from the same archive 
       displayheader = "Additional pages archived on " .. ulx.url1.date .. ": "
     end
     local tot = 1 + ulx.url1.extraurls
     local sand = displayheader
     for i=1,tot do
       local indx = "url" .. i
       displayfield = ulx[indx]["title"]
       if ulx.url1.format == "addlarchives" then
         if not displayfield then 
           displayfield = ulx[indx]["date"]
         end
       else
         if not displayfield then 
           displayfield = "Page " .. i
         end
       end
       sand = sand .. "[" .. ulx[indx]["url"] .. " " .. displayfield .. "]"
       if i == tot then
         sand = sand .. "."
       else
         sand = sand .. ", "
       end
     end
     return sand
   end

end

function p.webarchive(frame)

 args = frame.args
 if (args[1]==nil) and (args["url"]==nil) then           -- if no argument provided than check parent template/module args
   args = frame:getParent().args 
 end

 local tname = "Webarchive"                              -- name of calling template. Change if template rename.
 ulx = {}                                                -- Associative array to hold template data 
 track = {}                                              -- Associative array to hold tracking categories
 maxurls = 10                                            -- Max number of URLs allowed. 
 local verifydates = "yes"                               -- See documentation. Set "no" to disable.
                                                         -- URL argument (first)
 local url1 = trimArg(args.url) or trimArg(args.url1)           
 if not url1 then
   return inlineError("url", "Empty.") .. createTracking()
 end
 if mw.ustring.find( url1, "https://web.http", 1, plain ) then    -- track bug 
   track["Category:Webarchive template errors"] = 1 
   return inlineError("url", "https://web.http") .. createTracking()
 end 
 if url1 == "https://web.archive.org/http:/" then                 -- track bug
   track["Category:Webarchive template errors"] = 1 
   return inlineError("url", "Invalid URL") .. createTracking()
 end
 ulx.url1 = {}
 ulx.url1.url = url1
 local uri1 = mw.uri.new(ulx.url1.url)
 ulx.url1.host = uri1.host
 ulx.url1.extraurls = parseExtraArgs()
                                                         -- Nolink argument 
 local nolink = trimArg2(args.nolink)
 serviceName(uri1.host, nolink)
                                                         -- Date argument
 local date = trimArg(args.date) or trimArg(args.date1)
 if date == "*" and ulx.url1.service == "wayback" then
   date = "index"
 elseif date and ulx.url1.service == "wayback" and verifydates == "yes" then 
   local ldf = dateFormat(date)
   if ldf then
     local udate = decodeWaybackDate( uri1.path, ldf )
     if udate ~= date then
       date = udate .. inlineRed("[Date mismatch]", "warning")       
     end
   end
 elseif date and ulx.url1.service == "webcite" and verifydates == "yes" then 
   local ldf = dateFormat(date)
   if ldf then
     local udate = decodeWebciteDate( uri1.path, ldf )
     if udate == "query" then -- skip
     elseif udate ~= date then
       date = udate .. inlineRed("[Date mismatch]", "warning")      
     end
   end
 elseif not date and ulx.url1.service == "wayback" then
   date = decodeWaybackDate( uri1.path, "iso" )
   if not date then 
     date = inlineRed("[Date error] (1)", "error") 
   end
 elseif not date and ulx.url1.service == "webcite" then
   date = decodeWebciteDate( uri1.path, "iso" )
   if date == "query" then
     date = inlineRed("[Date missing]", "warning")
   elseif not date then 
     date = inlineRed("[Date error] (1)", "error")
   end
 elseif not date then
   date = inlineRed("[Date missing]", "warning")
 end
 ulx.url1.date = date
                                                         -- Format argument 
 local format = trimArg(args.format)
 if not format then
   format = "none"
 else
   if format == "addlpages" then
     if not ulx.url1.date then
       format = "none"
     end
   elseif format == "addlarchives" then
     format = "addlarchives"
   else
     format = "none"
   end
 end
 ulx.url1.format = format
                                                         -- Title argument 
 local title = trimArg(args.title) or trimArg(args.title1)
 ulx.url1.title = title
 
 local rend = createRendering()
 if not rend then
   rend = 'Error in Template:' .. tname .. ': Unknown problem. Please report on template talk page.'
   track["Category:Webarchive template errors"] = 1 
 end
 return rend .. createTracking()

end

return p