opexxx
2/10/2017 - 11:26 PM

A PowerShell script which uses the itextsharp.dll library to extract dates from PDF files located on a network share and check whether they are valid

A PowerShell script which uses the itextsharp.dll library to extract dates from PDF files located on a network share and check whether they are valid, then emails the result to people.

# PDF Files Check Script (created by beancurd1, please distribute the code with this section, thanks)
# It uses itextsharp.dll (downloaded from SourceForge) to parse PDF files, extracts the first date it finds and
# compares it against a predefined date. The names of the PDF files that don't match the predefined date are emailed to people.

Add-Type -Path .\itextsharp.dll
# Reference date that every PDF is expected to show; parsed once and reused below.
$validDate = "11 Dec 2015"
$validDateTime = [datetime]$validDate

# Day of month with any leading zero removed (e.g. "05" -> "5").
$day = $validDateTime.ToString('dd') -replace "^0", ""

#Define valid Month+Year format here, this will combine with $day in the search
$validMYArray = @(
    foreach ($monthYearFormat in ' MMM yyyy', ' MMMM yyyy', '/MM/yyyy', '-MMM-yy') {
        # Render the reference date in each accepted month/year notation.
        $validDateTime.ToString($monthYearFormat)
    }
)

# PDF Counters
$countTotal = $countGood = $countBad = 0
# Per-file state used by the parsing loop: the first non-matching date found,
# and a "Yes" flag set when the expected date was found.
$badDate = $foundDate = ""
# UNC root that is mapped as a PSDrive and searched for PDF files.
$PDFPath="\\server\share"
# Report text; one line per bad PDF is appended by the parsing loop.
$badPDF="Bad PDF (e.g. incorrect date, empty date):`n`n"
# Start timing the run. The summary section calls $stopWatch.Stop() and prints
# $stopWatch.Elapsed, which fails on a null value if the stopwatch is never created.
$stopWatch = [System.Diagnostics.Stopwatch]::StartNew()

Write-Host "Mapping a Drive..."
# Map the UNC share as a temporary PowerShell drive named "NetworkDrive".
# -ErrorAction Stop turns a mapping failure into a terminating error, so the
# script does not go on to scan nothing and email a misleading "0 PDFs" result.
New-PSDrive -Name NetworkDrive -PSProvider FileSystem -Root $PDFPath -ErrorAction Stop

Write-Host "Parsing PDF Files..."
#################################################################
### Search PDF Files from UNC folder, parse each PDF          ###
### output PDFs with incorrect date                           ###
#################################################################
# For every *.pdf file under NetworkDrive:\ the text of page 1 is extracted and
# searched line by line. The search stops at the first line containing a date
# in one of the $validMYArray notations:
#   - day equals $day     -> the PDF is counted as good
#   - any other day       -> the PDF is counted as bad and that date is reported
# A PDF with no recognisable date, or one that cannot be read, is counted as
# bad with an empty date.
Get-ChildItem -Path NetworkDrive:\ -Filter *.pdf -Recurse |
    Where-Object { -not $_.PSIsContainer } |   # skip folders whose name ends in .pdf
    ForEach-Object {
        $pdfFile = $_          # keep the file; $_ is replaced by the error record inside catch
        $countTotal++
        # Reset the per-file state up front so nothing leaks in from the previous file.
        $foundDate = ""
        $badDate = ""
        $pageText = @()
        $reader = $null
        try {
            $reader = New-Object iTextSharp.text.pdf.pdfreader -ArgumentList $pdfFile.FullName
            $pageText = [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, 1) -split "`n"
        } catch {
            # Unreadable/corrupt file: leave $pageText empty so it is reported as bad
            # instead of silently re-using the reader of the previous PDF.
            Write-Warning "Could not read $($pdfFile.FullName): $($_.Exception.Message)"
        } finally {
            # Release every reader as soon as its text is extracted; it locks the PDF file otherwise.
            if ($null -ne $reader) { $reader.Dispose() }
        }

        # search each line, look for a date which match the format defined in above
        :loop ForEach ($line in $pageText) {
            ForEach ($validMY in $validMYArray) {
                # Escape the month/year text so characters such as "." (a possible
                # culture-specific date separator) are matched literally.
                $monthYear = [regex]::Escape($validMY)
                # (?<!\d) / (?!\d) stop the day or year from matching inside a longer
                # number, e.g. day "1" must not be accepted for "31 Dec 2015".
                if ($line -cmatch "(?<!\d)0?$day$monthYear(?!\d)") {
                    $countGood++
                    $foundDate = "Yes"
                    break loop
                } elseif ($line -cmatch "(?<!\d)[0-3]?\d$monthYear(?!\d)") {
                    # extract incorrect date and append it to the PDFs
                    $badDate = $Matches[0]
                    break loop
                }
            }
        }

        if ($foundDate -ne "Yes") {
            $countBad++
            $badPDF += $pdfFile.FullName + " ($badDate)`t`n" #<-Insert a Tab character before `n to avoid Outlook Extra Line Break issue
        }
    }

# Unmap the drive
Remove-PSDrive -Name NetworkDrive

# Stop the timer if one was started earlier in the script; calling Stop() on a
# missing stopwatch would raise a null-reference error, so fall back to 0 sec.
$elapsedSeconds = 0
if ($null -ne $stopWatch) {
    $stopWatch.Stop()
    $elapsedSeconds = $stopWatch.Elapsed.TotalSeconds
}

# Remove "\\server\share\" from file path
# Only the literal share root is stripped, so files in sub-folders keep their
# relative path and same-named files in different folders stay distinguishable.
$shareRootPattern = [regex]::Escape($PDFPath.TrimEnd('\') + '\')
$badPDF = $badPDF -replace $shareRootPattern, ""
Write-Host "$badPDF`n`n$countTotal PDFs, Good=$countGood, Bad=$countBad $elapsedSeconds sec
$PDFPath"

Write-Host "Email Result..."
# Collect the Send-MailMessage arguments in a hashtable and splat them onto the cmdlet.
# NOTE(review): Body is a fixed placeholder string; the $badPDF report built
# earlier in the script is not part of the message - confirm this is intended.
$messageParameters = @{}
$messageParameters['Subject']    = "PDF Checked has finish"
$messageParameters['Body']       = "Say something here"
$messageParameters['From']       = "a@yahoo.com"
$messageParameters['To']         = "b@yahoo.com"
$messageParameters['SmtpServer'] = "mailserver"
Send-MailMessage @messageParameters