#!/usr/local/bin/perl
# 
# Uses docx2txt project of Sandeep Kumar:
# http://docx2txt.sourceforge.net/
# Uses newbie Perl/Tk example code from:
# http://www.geocities.com/binnyva/code/perl/perl_tk_tutorial/
# Uses the CakeCMD unzipper because other command-line unzippers
# do not extract corrupt word/xml and word/_rels/document.xml.rels files
#
use strict;
use warnings;
use Tk;
use Win32::GUI();
use File::Path qw(remove_tree);
use File::Basename;
use File::Copy;
use File::Basename;
use File::Path;
use Spreadsheet::ParseExcel;
use Spreadsheet::XLSX::Fmt2007;
use Spreadsheet::XLSX::Utility2007PDP;
use Tk::DialogBox;
use Tk::FileSelect;
use Tk::JComboBox;
use Tk::Menu;
use Tk::Menubutton;
use Tk::NoteBook;
use Tk::Pretty;
use Tk::Button;
use Tk::TableMatrix::Spreadsheet;
BEGIN {
    # Win32::SetChildShowWindow is only called when the running perl
    # provides it.  Passing 0 asks Win32 to start child processes with
    # their window hidden, so the command-line helpers this program
    # launches do not pop up console windows.
    if ( defined &Win32::SetChildShowWindow ) {
        Win32::SetChildShowWindow(0);
    }
}

# 
# Create the Main Window
# 
# Build the application's top-level window.  The direct method call
# MainWindow->new replaces the indirect-object form "new MainWindow",
# whose parse depends on what happens to be in scope.
my $mw = MainWindow->new;
$mw->title('Corrupt office2txt');
#
# Replace the default Tk window icon with the application's own logo.
# icon_32x32.gif is looked up relative to the current working directory.
# The icon is purely cosmetic, so a missing or unreadable image is
# reported and skipped instead of aborting start-up; $icon is then undef.
#
my $icon = eval { $mw->Photo(-file => 'icon_32x32.gif') };
if ($icon) {
    $mw->iconimage($icon);
}
else {
    warn "Unable to load icon_32x32.gif; using the default window icon.\r\n\r\n";
}
# 
# Declare that there is a menu, create text 
# editor and create a vertical scroll bar
#
# Menu bar: created from the main window and installed as its -menu.
my $mbar = $mw->Menu;
$mw->configure( -menu => $mbar );

# The output pane lives in its own frame: an 80x22 Text widget plus a
# vertical scrollbar.
my $textarea = $mw->Frame;
my $txt      = $textarea->Text( -width => 80, -height => 22 );
my $srl_y    = $textarea->Scrollbar(
    -orient  => 'v',
    -command => [ 'yview', $txt ],
);

# Link the widgets in both directions: the scrollbar drives the text
# widget's yview, and the text widget reports its position back through
# the scrollbar's set method.
$txt->configure( -yscrollcommand => [ 'set', $srl_y ] );

# Grid layout: text in column 1, scrollbar stretched north-south in
# column 2, and the enclosing frame on row 5 of the main window.
$txt->grid( -row => 1, -column => 1 );
$srl_y->grid( -row => 1, -column => 2, -sticky => 'ns' );
$textarea->grid( -row => 5, -column => 1, -columnspan => 2 );
# 
# Main Menu Choices Setup section
# 
# Two pull-down menus hang off the menu bar; neither can be torn off.
my $file = $mbar->cascade( -label => 'File', -underline => 0, -tearoff => 0 );
my $help = $mbar->cascade( -label => 'Help', -underline => 0, -tearoff => 0 );

#
# File menu.  Each row is: label, handler, extra argument handed to the
# handler.  All four entries underline their first character.
#
my @file_entries = (
    [ 'Extract I',                        \&menuopenClickedNoFrills, 'Open' ],
    [ 'Extract II',                       \&menuopenClickedCakeCMD,  'Open' ],
    [ 'Full Recovery (Open Office Only)', \&recovered,               'Open' ],
    [ 'Save',                             \&menusavedClicked,        'Save' ],
);
for my $entry (@file_entries) {
    my ( $label, $handler, $argument ) = @{$entry};
    $file->command(
        -label     => $label,
        -underline => 0,
        -command   => [ $handler, $argument ],
    );
}

# A separator, then Exit, which ends the program outright.
$file->separator;
$file->command(
    -label     => 'Exit',
    -underline => 1,
    -command   => sub { exit },
);
# 
# Help Menu Choices section 
# 
# Adds the "About and Instructions" entry to the Help menu.  Its callback
# empties the text widget and then inserts the built-in help text below.
# The text is one double-quoted string, so the "@" and "$" in the contact
# section are backslash-escaped to stop Perl from interpolating them.
$help -> command(-label =>"About and Instructions", -command => sub { 
	# Clear whatever is currently displayed before showing the help text.
	$txt->delete('1.0','end');
	$txt->insert('end',
	"*****About and Instructions******
	
----Installation Instructions----

1. Extract all the files to a folder.
2. Run corrupt_open_office_recovery.exe by double clicking on it. 
3. Also note the following files need to be in the same folder as 
the executable or the program will not run correctly or at all: 
coffice2txt.exe, rt.exe, doctotext.exe, ppthtml.exe, HtmlAsText.exe, 
xlhtml.exe, CakeCmd.exe, no-frills.exe, Cake3.dll, ICSharpCode.
SharpZipLib.dll, UnzDll.dll, ZipDll.dll and the other
dll files in the orignal installation folder as well as the
icon_32x32.gif. Sevenzipcmd.exe should be in a folder called
'unzipped' of the root.
--Note if you try to recover an Open Office Calc file and you use
Lotus Symophony as your default Open Office file opener, Lotus
Symphony will go in an endless loop and eventually time out after
using up all youyr availabler memory. Use Open Office Calc proper,
Excel or Gnumeric to open the repaired file instead.  To locate
the repaired file, look in the root where this program is run for.
 
-----How to use this program------

1.  Click on the File Menu and choose Full Recovery or one of the two
salvage methods.
2.  Choose your Open Office file with the extension .odt, .ods or .odp.
3.  If you chose a salvage method, your extracted text will be displayed.
If you chose a the Full Recovery choice the application will attempt
to launch a recovered version with the software currently assigned as
the default for the corrupt file's extension, odt, ods or odp.
4.  If you chose a Salvage method, you can Edit, the text as desired.
5.  Next, choose the Save menu choice on the File Menu and save the text 
file to the name and file location you wish you wish fior your salvaged 
text.

-----About-----

This program will extract the text even from damaged or corrupted Microsoft
Office and Open Office files 2.X and 3.X files with the extensions .doc,
docx, xls, xlsx, ppt, pptx, odt, ods and odp as well as possibly the template
and macro variants of these enxtensions such as dot, xlt and pps if they 
are changed to the correct corresponding extensions mentioned.  It may 
succeed at doing so where MS Office Open Office itself fails to salvage 
text. It can also attempt to recover formatting in the form of a full 
Open Office file with a regular, odt, ods or odp extension. At this time 
unfortuantely there is no facility for recovering anything but basic 
formatting for MS Office files through the previously mentioned text 
extractions.  This program can be used as a viewer of text within healthy 
MS Office and Open Office files without having Open Office installed. 

The text extraction is accomplished with the use of the command line
application, SILVERCODERS DocToText. The program also useds command line tools 
from The Chicago Project and ReadText, rt.exe to extract data and text from MS
Office version 97-2003 format files.  The reconstructed version of the Open 
Office file is accomplished by unzipping the Open Office file with the somewhat 
zip corruption immune CakeCMD unzipper. Once unzipped, the manifest/manifest.xml
file is replaced with a greatly simplified version as described here: http://
www.oooforum.org/forum/viewtopic.phtml?t=57600. 

If this application doesn't work, there are other things worth trying 
as summarized here: http://s2services.com/open_office.htm

----Changes to 0.22----

1. First released version.

-----Credits-----

This program is made by Paul D Pruitt (socrtwo) and uses the following
command line applications in its operation: SILVERCODERS DocToText; 
xlhtml and ppthml from The Chicago project; Runar Skaret's 
ReadText; cakecmd.exe unzipper by Leung Yat Chun Joseph; No-Frills Unzipper 
by Ccy; 7-Zip CMD reszipper; and Nirsoft's HTMLAsText. It also uses Perl/Tk 
code for the GUI elements as described here http://www.bin-co.com/perl/
perl_tk_tutorial/. 

Here are the links:
* ReadText:http://members.fortunecity.com/bigg5/frw/diagn.htm
* DocToText: http://silvercoders.com/en/products/doctotext
* Xlhtml and Ppthtml: http://prdownloads.sf.net/chicago/xlHtml-Win32-040.zip
* No-Frills Unzipper: http://godskingsandheroes.info/software/
#no-frills_command_line_unzipper 
* CakeCMD Unzipper:http://www.quickzip.org/softwares-cakecmd and for .Net. 2.0, see: 
http://filehippo.com/download_dotnet_framework_2/
* 7-Zip Command Line Version: http://www.7-zip.org/download.html
* NirSoft's HtmlAsText: http://www.nirsoft.net/utils/htmlastext.html

-----Contact Info-----
* My software website is http://www.godskingsandheroes.info/software/.
* Also visit my data recovery software list http://www.s2services.com.
* My E-Mail: socrtwo\@s2services
* My phone number is 301-493-4982. 
* I do data recovery for \$22 an incident. I sometimes do charity work.
"); 
});
# 
# Open Dialog Box File Extension Declaration section
# 
# File-type filters offered by the open and save dialogs.  Each entry is
# a [ description, extensions ] pair.
my $typesopen = [
    [
        'Microsoft and Open Office Document',
        '.doc .docx .xls .xlsx .ppt .pptx .odt .ods .odp',
    ],
    [ 'All files', '*' ],
];
my $typesopen2 = [
    [ 'Open Office Document', '.odt .ods .odp' ],
    [ 'All files',            '*' ],
];
my $typessaved = [
    [ 'Text files', '.txt' ],
    [ 'All files',  '*' ],
];

# Scratch variables shared by the menu handlers defined further down.
# Several of them are never touched by the handlers visible in this
# file; they are still declared so any other code relying on them keeps
# compiling under "use strict".
my $lcfilename;
my (
    $fullname, $dirname, $mainfilepath, $name, $path,
    $suffix,   $but,     $lcdirname,    $newlongpath,
);
my (
    $filetobecopied, $newmanifested, $wfh,        $salvagedtxt,
    $dir,            $strFolderPath, $lcbasename, $uslcbasename,
);
my (
    $objFSO,   $zipfilename, $content,       $saved,
    $docx_name, $editedcontent, $recoveredfile, $cprecoveredfile,
);
my %docurels;

# Suffix list handed to basename(); it stays empty, so basename() keeps
# the file extension.
my @suffixlist;

# Text-layout settings.
my $nl      = "\r\n";    # Alternative is "\n".
my $lindent = "  ";      # Indent nested lists by "\t", " " etc.
my $lwidth  = 80;        # Line width, used for short line justification.

#
# Hand control to the Tk event loop; the menu callbacks run from inside
# it.
#
MainLoop;
#
# menuopenClickedNoFrills - handler for the "Extract I" menu entry.
#
# Asks the user for a document, copies it to a randomly named work file
# in the current directory, runs a command-line extractor over that copy
# and shows the extracted text in the $txt widget.  .doc and .ppt files
# go through rt.exe; everything else goes through doctotext.exe with
# no-frills.exe as its unzipper.
#
# Arguments: whatever the menu binding passes (ignored).
# Returns:   nothing.
# Side effects: sets the file-level variables $mainfilepath, $lcfilename,
#   $lcbasename, $lcdirname, $uslcbasename and $salvagedtxt; overwrites
#   processing.txt with the extractor's stderr; replaces the contents of
#   the text widget.
#
# Differences from the previous implementation:
#   * The document is copied straight to the random work name.  The old
#     intermediate copy to $uslcbasename could be the user's own file
#     (same name, current directory); the copy then failed and the final
#     unlink deleted the original document.
#   * The extension is taken from the lower-cased base name, so FOO.DOC
#     is routed like foo.doc and a dot in a directory name is never
#     mistaken for an extension.
#   * A failed copy stops the handler instead of running the extractor
#     on a file that does not exist.
#
sub menuopenClickedNoFrills {
    $mainfilepath = $mw->getOpenFile(
        -filetypes        => $typesopen,
        -defaultextension => '.doc .docx .xls .xlsx .ppt .pptx .odt .ods .odp',
    );
    return unless $mainfilepath;

    $lcfilename = lc($mainfilepath);
    $lcbasename = basename($lcfilename, @suffixlist);
    $lcdirname  = dirname($lcfilename);
    print "$lcbasename\r\n\r\n";

    # Still computed because it is file-level state, but no file of this
    # name is created or deleted any more.
    $uslcbasename = $lcbasename;
    $uslcbasename =~ s/ /_/g;
    print "$uslcbasename\r\n\r\n";

    # Only plain alphanumeric extensions are accepted: the work file name
    # is interpolated into a shell command line below.
    my $file_type = $lcbasename =~ /\.([a-z0-9]+)\z/ ? $1 : '';

    my $random_long = int(rand(10000000));
    $salvagedtxt = $random_long . '.txt';
    print "$salvagedtxt\r\n\r\n";

    my $randomname = length $file_type ? "$random_long.$file_type" : "$random_long";
    unless (copy($mainfilepath, $randomname)) {
        warn "Unable to copy $mainfilepath to $randomname: $!\r\n\r\n";
        return;
    }
    print "$randomname\r\n\r\n";

    # The extractor's stderr is collected here; the file is left in place
    # afterwards so it can be inspected.
    my $processing = "processing.txt";

    my ($tool, $command);
    if ($file_type eq 'doc' or $file_type eq 'ppt') {
        $tool    = 'rt';
        $command = "rt.exe $randomname $salvagedtxt 2> $processing";
    }
    else {
        $tool    = 'DocToText';
        $command = "doctotext.exe --fix-xml --unzip-cmd=\"no-frills.exe %a %d %f\" $randomname > $salvagedtxt 2> $processing";
    }

    # The command is started through a write pipe, as before, so the tool
    # sees end-of-file on stdin straight away; close() waits for it to
    # finish.  A non-zero exit is reported but not treated as fatal
    # because the tool may still have produced output.
    if (open my $pipe, '|-', $command) {
        close $pipe
            or warn "$tool finished abnormally on $randomname (status $?).\r\n\r\n";
    }
    else {
        warn "Unable to use $tool to extract text from $randomname to $salvagedtxt.\r\n\r\n";
    }

    # Read the result as raw bytes into a lexical instead of the global $_.
    my $extracted = '';
    if (open my $in, '<:raw', $salvagedtxt) {
        local $/ = undef;
        my $data = <$in>;
        $extracted = $data if defined $data;
        close $in;
    }
    else {
        warn "Couldn't open $salvagedtxt for reading into the text area: $!\r\n\r\n";
    }

    $txt->delete('1.0', 'end');
    $txt->insert('end', $extracted);

    # Remove only the files this handler created.
    unlink $randomname, $salvagedtxt;
    return;
}
#
# menuopenClickedCakeCMD - handler for the "Extract II" menu entry.
#
# Asks the user for a document, copies it to a randomly named work file
# in the current directory and extracts its text:
#   * .ppt -> ppthtml.exe, then HtmlAsText.exe turns the HTML into text
#   * .xls -> xlhtml.exe,  then HtmlAsText.exe turns the HTML into text
#   * anything else -> doctotext.exe, using "CakeCmd.exe extract" as the
#     unzipper, on a copy carrying a .zip extension
# The text is then shown in the $txt widget.
#
# Arguments: whatever the menu binding passes (ignored).
# Returns:   nothing.
# Side effects: sets the file-level variables $mainfilepath, $lcfilename,
#   $lcbasename, $lcdirname, $uslcbasename and $salvagedtxt; overwrites
#   processing.txt; for .ppt/.xls writes test.cfg next to the script;
#   replaces the contents of the text widget.
#
# Differences from the previous implementation:
#   * The document is copied straight to the random work name.  The old
#     intermediate copy to $uslcbasename could be the user's own file
#     (same name, current directory); the copy then failed and the final
#     unlink deleted the original document.
#   * The extension is taken from the lower-cased base name, so FOO.PPT
#     is no longer sent down the zip branch.
#   * test.cfg, text.html and text.txt are written and read at the same
#     absolute paths HtmlAsText is told about, instead of relative to
#     the current directory.
#   * The duplicated .ppt and .xls branches are one code path.
#   * Warnings name the file and tool that actually failed.
#
sub menuopenClickedCakeCMD {
    use Cwd 'abs_path';    # compile-time import needed for abs_path() below

    $mainfilepath = $mw->getOpenFile(
        -filetypes        => $typesopen,
        -defaultextension => '.doc .docx .xls .xlsx .ppt .pptx .odt .ods .odp',
    );
    return unless $mainfilepath;

    $lcfilename = lc($mainfilepath);
    $lcbasename = basename($lcfilename, @suffixlist);
    $lcdirname  = dirname($lcfilename);
    print "$lcbasename\r\n\r\n";

    # Still computed because it is file-level state, but no file of this
    # name is created or deleted any more.
    $uslcbasename = $lcbasename;
    $uslcbasename =~ s/ /_/g;
    print "$uslcbasename\r\n\r\n";

    # Only plain alphanumeric extensions are accepted: the work file name
    # is interpolated into shell command lines below.
    my $file_type = $lcbasename =~ /\.([a-z0-9]+)\z/ ? $1 : '';

    my $random_long  = int(rand(10000000));
    my $salvagedhtml = $random_long . '.html';
    print "$salvagedhtml\r\n\r\n";
    $salvagedtxt = $random_long . '.txt';
    print "$salvagedtxt\r\n\r\n";

    my $randomname = length $file_type ? "$random_long.$file_type" : "$random_long";
    unless (copy($mainfilepath, $randomname)) {
        warn "Unable to copy $mainfilepath to $randomname: $!\r\n\r\n";
        return;
    }
    print "$randomname\r\n\r\n";

    # stderr of every external tool goes here; each run overwrites it.
    my $processing = "processing.txt";

    # Runs one shell command through a write pipe, as before, so the tool
    # sees end-of-file on stdin straight away; close() waits for it.  A
    # non-zero exit is reported but not fatal.
    my $run_tool = sub {
        my ($command, $cannot_start) = @_;
        if (open my $pipe, '|-', $command) {
            close $pipe
                or warn "Command finished abnormally (status $?): $command\r\n\r\n";
        }
        else {
            warn $cannot_start;
        }
        return;
    };

    # Returns the raw bytes of a file, or '' (with a warning) when it
    # cannot be opened.
    my $slurp = sub {
        my ($path) = @_;
        my $data = '';
        if (open my $in, '<:raw', $path) {
            local $/ = undef;
            my $bytes = <$in>;
            $data = $bytes if defined $bytes;
            close $in;
        }
        else {
            warn "Couldn't open $path for reading into the text area: $!\r\n\r\n";
        }
        return $data;
    };

    my @work_files = ($randomname);    # everything to delete at the end
    my $extracted  = '';

    if ($file_type eq 'ppt' or $file_type eq 'xls') {
        # Step 1: binary Office file -> HTML.
        my $converter = $file_type eq 'ppt' ? 'ppthtml' : 'xlhtml';
        $run_tool->(
            "$converter.exe $randomname > $salvagedhtml 2> $processing",
            "Unable to use $converter to convert $randomname to HTML.\r\n\r\n",
        );

        # Step 2: HTML -> text with HtmlAsText, driven by a config file
        # that names its input and output by absolute path.
        my $appdir     = dirname(abs_path($0));
        my $cfgabspath = $appdir . '\test.cfg';
        my $relhtmldir = $appdir . '\text.html';
        my $reltextdir = $appdir . '\text.txt';
        print "$cfgabspath\r\n\r\n";
        print "$relhtmldir\r\n\r\n";
        print "$reltextdir\r\n\r\n";

        my @cfg_lines = (
            '[Config]',
            'OpenInNotepad=0',
            'CharsPerLine=80',
            "Source=$relhtmldir",
            "Dest=$reltextdir",
            'SkipTitleText=0',
            'AddLineUnderHeader=0',
            'SkipTableHeaderText=0',
            'TableCellDelimit=2',
            'HeadingLineChars=======',
            'HorRuleChar==',
            'ListChars=*o-@#',
            'ConvertMode=1',
            'AllowCenterText=0',
            'AllowRightText=0',
            'DLSpc=8',
            'LinksDisplayFormat=%T',
            'EncloseBoldCharsStart=<<',
            'EncloseBoldCharsEnd=>>',
            'EncloseBold=0',
            'SubFolders=0',
            '',
        );
        if (open my $cfg, '>', $cfgabspath) {
            print {$cfg} join("\n", @cfg_lines), "\n"
                or warn "Unable to write $cfgabspath: $!\r\n\r\n";
            close $cfg
                or warn "Unable to finish writing $cfgabspath: $!\r\n\r\n";
        }
        else {
            warn "Unable to create $cfgabspath: $!\r\n\r\n";
        }

        copy($salvagedhtml, $relhtmldir)
            or warn "Unable to copy $salvagedhtml to $relhtmldir: $!\r\n\r\n";
        print "$relhtmldir\r\n\r\n";

        $run_tool->(
            "HtmlAsText.exe /run \"$cfgabspath\" 2> $processing",
            "Unable to use HtmlAsText.exe to extract text from the html results file produced by $converter.\r\n\r\n",
        );

        $extracted = $slurp->($reltextdir);
        push @work_files, $salvagedhtml, $relhtmldir, $reltextdir;
    }
    else {
        # doctotext is pointed at a copy carrying a .zip extension.
        my $randomzipfilename = $random_long . '.zip';
        if ($randomzipfilename ne $randomname) {
            copy($randomname, $randomzipfilename)
                or warn "Unable to copy $randomname to $randomzipfilename: $!\r\n\r\n";
            push @work_files, $randomzipfilename;
        }
        print "$randomzipfilename\r\n\r\n";

        my $cakecmdextract = 'CakeCmd.exe extract';
        $run_tool->(
            "doctotext.exe --fix-xml --unzip-cmd=\"$cakecmdextract %a %f %d\" $randomzipfilename > $salvagedtxt 2> $processing",
            "Unable to use DocToText to extract text from $randomzipfilename.\r\n\r\n",
        );

        $extracted = $slurp->($salvagedtxt);
        push @work_files, $salvagedtxt;
    }

    $txt->delete('1.0', 'end');
    $txt->insert('end', $extracted);

    # Remove only the files this handler created.
    unlink @work_files;
    return;
}
#
# menusavedClicked - handler for the "Save" menu entry.
#
# Takes the current contents of the text widget, asks the user for a
# destination with a save dialog and writes the text there.  The file is
# opened in text mode, as before.
#
# Arguments: whatever the menu binding passes (ignored).
# Returns:   nothing.
# Side effects: sets the file-level variables $editedcontent and $saved;
#   creates or overwrites the chosen file; shows an error dialog when
#   the file cannot be written.
#
# Differences from the previous implementation:
#   * "close $saved" is gone.  $saved holds the file NAME, and using a
#     string as a filehandle is a symbolic reference, which "use strict"
#     turns into a run-time error after every save.
#   * open/print/close are checked, and a failure is shown to the user
#     instead of silently losing the text.
#
sub menusavedClicked {
    $editedcontent = $txt->Contents();
    $saved = $mw->getSaveFile(
        -filetypes        => $typessaved,
        -defaultextension => '.txt',
    );
    return unless $saved;

    my $problem;
    if (open my $out, '>', $saved) {
        print {$out} $editedcontent
            or $problem = "writing failed: $!";
        # Buffered write errors can surface only at close time.
        close $out
            or $problem ||= "closing failed: $!";
    }
    else {
        $problem = "cannot open for writing: $!";
    }

    if (defined $problem) {
        warn "Unable to save $saved ($problem).\r\n\r\n";
        $mw->messageBox(
            -title   => 'Save failed',
            -message => "Unable to save $saved\n($problem)",
            -type    => 'OK',
            -icon    => 'error',
        );
    }
    return;
}
#
# recovered - handler for the "Full Recovery (Open Office Only)" entry.
#
# Rebuilds a damaged Open Office file:
#   1. copy the chosen file to "<name>.zip" in the current directory
#   2. unzip it into ./unzipped with cakecmd.exe
#   3. replace unzipped\META-INF\manifest.xml with the simplified copy
#      kept in do_not_remove\manifest.xml
#   4. re-zip the tree with ..\sevenzipcmd.exe as "recovered_<name>"
#   5. move the result up into the current directory, delete ./unzipped
#      and open the result through the shell
#
# Arguments: whatever the menu binding passes (ignored).
# Returns:   nothing.
# Side effects: sets the file-level variables $mainfilepath, $lcfilename,
#   $lcbasename, $lcdirname, $uslcbasename, $zipfilename, $filetobecopied,
#   $newmanifested, $recoveredfile and $cprecoveredfile; temporarily
#   changes the working directory; overwrites processing.txt; blocks in
#   system() until the launched command returns.
#
# Differences from the previous implementation:
#   * The source is copied straight to the .zip work name and only that
#     work file is deleted.  The old copy/rename/unlink sequence on
#     $uslcbasename deleted the user's original when it was in the
#     current directory under the same name.
#   * If ./unzipped cannot be entered the handler stops; it used to carry
#     on and "chdir ../" out of the program directory.
#   * 7-Zip's stderr goes to ..\processing.txt so the log is not created
#     inside the directory being archived.
#   * File names are quoted on the command lines.
#   * The result is launched only when it was actually moved into place.
#
sub recovered {
    $mainfilepath = $mw->getOpenFile(
        -filetypes        => $typesopen2,
        -defaultextension => '.odt, ods, .odp',
    );
    return unless $mainfilepath;

    $lcfilename = lc($mainfilepath);
    $lcbasename = basename($lcfilename, @suffixlist);
    $lcdirname  = dirname($lcfilename);
    print $lcbasename;

    $uslcbasename = $lcbasename;
    $uslcbasename =~ s/ /_/g;

    # Work copy with a .zip extension for the unzipper.  The name always
    # differs from the base name of the selected file, so this never
    # copies a file onto itself.
    $zipfilename = $uslcbasename . '.zip';
    unless (copy($mainfilepath, $zipfilename)) {
        warn "unable to copy $mainfilepath to $zipfilename: $!\r\n\r\n";
        return;
    }

    my $processing = "processing.txt";

    # Runs one shell command through a write pipe, as before, so the tool
    # sees end-of-file on stdin straight away; close() waits for it.  A
    # non-zero exit is reported but not fatal.
    my $run_tool = sub {
        my ($command, $cannot_start) = @_;
        if (open my $pipe, '|-', $command) {
            close $pipe
                or warn "Command finished abnormally (status $?): $command\r\n\r\n";
        }
        else {
            warn $cannot_start;
        }
        return;
    };

    #
    # Unzip the Open Office file into ./unzipped, then drop the work copy.
    #
    $run_tool->(
        qq{cakecmd.exe extract "$zipfilename" * unzipped 2> $processing},
        "Unable to unzip $zipfilename with CakeCMD.\r\n\r\n",
    );
    unlink $zipfilename;

    #
    # manifest.xml is replaced by a copy with most of the style XML
    # removed, as described in the Open Office forums.
    #
    $filetobecopied = 'do_not_remove\manifest.xml';
    $newmanifested  = 'unzipped\META-INF\manifest.xml';
    copy($filetobecopied, $newmanifested)
        or warn "Copy of do_not_remove\\manifest.xml to unzipped\\META-INF\\manifest.xml failed.\r\n\r\n";

    $recoveredfile = 'recovered_' . $uslcbasename;

    # The archive must be built from inside the tree so that entries are
    # stored relative to it.  Without the tree there is nothing to zip.
    unless (chdir('unzipped')) {
        warn "Cannot change directory to unzipped.\r\n\r\n";
        return;
    }

    $run_tool->(
        qq{..\\sevenzipcmd.exe a -tzip "$recoveredfile" * 2> ..\\$processing},
        "Unable to rezip $recoveredfile.\r\n\r\n",
    );

    $cprecoveredfile = '../' . $recoveredfile;
    my $moved = move($recoveredfile, $cprecoveredfile);
    warn "File cannot move $recoveredfile to $cprecoveredfile.\r\n\r\n"
        unless $moved;

    chdir('../') or warn "Cannot change directory back to root.\r\n\r\n";
    remove_tree('unzipped') or warn "Cannot remove unzipped directory.\r\n\r\n";

    # Hand the rebuilt document to the shell, which opens it with the
    # program registered for its extension.  Only a failure to start the
    # command is reported; the exit status of the viewer is ignored.
    if ($moved) {
        my $status = system(qq{"$recoveredfile"});
        warn "Unable to launch $recoveredfile: $!\r\n\r\n" if $status == -1;
    }
    return;
}

