User Tools

Site Tools


ged2wiki

This is an old revision of the document!


(Back to the start page)

Ged2Wiki

Ged2Wiki is a project to convert a GEDCOM into DokuWiki pages so that they can be added to this (or any other DokuWiki-based) site.

This program is written in the Perl programming language – and while I had initially thought that the GEDCOM module from CPAN would be a good starting point, I later decided that starting from the ground up was easier (for me).

As of 24 Nov 2007, I have written only the parser. This version reads a GEDCOM file and stores all the relevant data in hashes. It then spits out basic data about each family to stdout. Eventually, each family will be written to a separate file.

Here's what it looks like so far:

#
# Ged2Wiki.pl - Perl program to convert a GEDCOM file to plain text Wiki pages
#
# The input format is based on "The GEDCOM Standard" Release 5.5, as published
# by the Family History Department of The Church of Jesus Christ of Latter-day
# Saints, 2 January 1996.
#
# The output format is a series of plain-text files based on DokuWiki
#
# For questions, comments, bug reports, etc., please visit this site:
#    http://s560.com/ged2wiki
#

# The globals for the GEDCOM input lines
my $currline;
my $nextline;
my $currlevel;
my $nextlevel;

# The globals for the decoded data about individuals
# (Index = Individual Xref tag)
my %name;         # Full Name
my %sex;          # 'M' or 'F'
my %birth;        # Birth Date and/or Place
my %death;        # Death Date and/or Place
my %burial;       # Burial Date and/or Place
my %familydown;   # Xref tag for Individual's Family (FAMS)
my %familyup;     # Xref tag for Parent's Family (FAMC)
my %inote;        # Note
my %private;      # Is data private? (If so, do not show details on output)

# most of the data above can have an associated note
my %nnote;        # Note for Name
my %bnote;        # Note for Birth
my %dnote;        # Note for Death
my %burnote;      # Note for Burial
my %fdnote;       # Note for Family (Up)
my %funote;       # Note for Family (Down)

# The globals for the decoded data about families
# (Index = Family Xref tag)
my %familyflag;   # This entry will exist for ALL families
my %husband;      # Individual tag for husband
my %wife;         # Individual tag for wife
my %marriage;     # Marriage Date and/or Place
my %fnote;        # Note
my %children;     # List of individual tags for children
my %mnote;        # Note for Marriage event

# The global for records other than Individuals and Families
my %note;         # Note

# Prime the parser by reading the first line
fetch_line();

# "Read" the first line
read_line();

# The first line must be "0 HEAD"
if ($currline !~ /^HEAD/) {
   print "Input does not appear to be a GEDCOM file\n";
   exit;
}

# First line is good.  Parse the rest of the header
parse_header();

# If it's a Submission record, parse it here
if ($currline =~ /^\@(.*?)\@\s+SUBN/) {
   parse_subn($1);
}

# Loop here, reading the "Record" entries until we find the Trailer
while (1) {

   # First, check for the Trailer record
   if ($currline =~ /^TRLR/) {
      last; # We're done with the input - break out of this loop
   }

   # Each Record must be in this format
   if ($currline !~ /^\@(.*?)\@\s+(\w+)\s*/) {
      print "Unexpected Record at line $.:\n>> $currline\n";
      exit;
   }
   my $xref = $1;
   my $tag = $2;
   my $data = "$'";

   # If it's a "FAM" record, parse it here
   if ($tag eq "FAM") {
      parse_family($xref);
      next;
   }

   # If it's a "INDI" record, parse it here
   if ($tag eq "INDI") {
      parse_individual($xref);
      next;
   }

   # If it's a "NOTE" record, parse it here
   if ($tag eq "NOTE") {
      parse_note($xref, $data);
      next;
   }

   # If it's an unknown record, it's an error
   if ($tag !~ /^(OBJE|REPO|SOUR|SUBM)$/) {
      print "Unexpected Tag at line $.:\n>> $currline\n";
      exit;
   }

   # Ignore this type of record
   parse_ignore();
}

print "GEDCOM file parsed successfully\n";

# Generate all the output here ...
foreach my $fxref (keys %familyflag) {
   # Output data for family $fxref
   output_family($fxref);
}
exit;

# Read one line from the input file (and strip unwanted whitespace)
sub fetch_line {
   $nextline = <>;
   $nextline =~ s/^\s+//;        # Strip leading whitespace
   $nextline =~ s/[\s\r\n]*$//;  # Strip trailing whitespace (and newline)
   if ($nextline !~ /^(\d{1,2})\s+/) {
      print "Invalid GEDCOM data at line $.:\n>> $nextline\n";
      exit;
   }
   $nextlevel = $1;
   $nextline = "$'";
}

# One input line -> $currline (concatenating as needed)
sub read_line {

   $currline = $nextline;
   $currlevel = $nextlevel;

   # Special case: Do not read beyond the Trailer
   if ($currline =~ /^TRLR/) {
      $nextline = "";
      $nextlevel = "";
      return;
   }

   fetch_line();

   # Special case: A tag without any parameters - Add a single space
   if ($currline =~ /^(\@\w+\@)?\s+\w+$/) {
      $currline .= " ";
   }

   while ($nextlevel == $currlevel + 1 && $nextline =~ /^CON([CT])\s*/) {
      if ($1 eq "C") {
         $currline .= "$'";
      } else {
         $currline .= "\n$'";
      }
      fetch_line();
   }
}

# Parse the header
sub parse_header {
   parse_ignore();   # For now, simply ignore it

   # Format of Header (brackets denote optional entries)
   #   0 HEAD
   #   1 SOUR
   # [ 1 DEST ]
   # [ 1 DATE ]
   #   1 SUBM
   # [ 1 SUBN ]
   # [ 1 FILE ]
   # [ 1 COPR ]
   #   1 GEDC
   #   1 CHAR
   # [ 1 LANG ]
   # [ 1 PLAC ]
   # [ 1 NOTE ]

}

# Parse the submission record
sub parse_subn {
   parse_ignore();   # For now, simply ignore it

   # Format of Submission record (all sub-fields are optional)
   #   0 @XREF@ SUBN
   # [ 1 SUBM ]
   # [ 1 FAMF ]
   # [ 1 TEMP ]
   # [ 1 ANCE ]
   # [ 1 DESC ]
   # [ 1 ORDI ]
   # [ 1 RIN ]

}

# Parse a Family record
sub parse_family {
   my $ftag = $_[0];

   # All families must have this marker entry
   $familyflag{$ftag} = 1;

   # Format of Family record (brackets denote optional entries)
   #   0 @XREF@ FAM
   # { 1 ANUL|CENS|DIV|DIVF|ENGA|MARB|MARC|MARL|MARS|EVEN }
   # { 1 MARR }   ->    %
   # [ 1 HUSB ]   ->    %
   # [ 1 WIFE ]   ->    %
   # { 1 CHIL }   ->    %
   # [ 1 NCHI ]
   # { 1 SUBM }
   # [ 1 SLGS ]
   # { 1 SOUR }
   # { 1 OBJE }
   # { 1 NOTE }   ->    %fnote
   # { 1 REFN }
   # [ 1 RIN ]
   # [ 1 CHAN ]

   # Read the first sub-line
   read_line();

   while ($currlevel > 0) {

      my $work;

      # Record this family's husband
      # (If there are multiple entries of this type, use only the first)
      if ($currline =~ /^HUSB\s+\@(.*?)\@/) {
         if ($husband{$ftag} eq "") {
            $husband{$ftag} = $1;
         }
      }

      # Record this family's wife
      # (If there are multiple entries of this type, use only the first)
      if ($currline =~ /^WIFE\s+\@(.*?)\@/) {
         if ($wife{$ftag} eq "") {
            $wife{$ftag} = $1;
         }
      }

      # Record this family's children
      # (The result is a comma-separated list of Individual tags)
      if ($currline =~ /^CHIL\s+\@(.*?)\@/) {
         if ($children{$ftag} eq "") {
            $children{$ftag} = $1;
         } else {
            $children{$ftag} .= ",$1";
         }
      }

      # Record this family's marriage event
      if ($currline =~ /^MARR/) {
         if ($marriage{$ftag} eq "") {
            $marriage{$ftag} = read_event();
         }
         $work = read_note();
         if ($work ne "") {
            $mnote{$ftag} .= $work;
         }
         next;
      }

      # Record any notes associated with this family
      if ($currline =~ /^NOTE\s+/) {
         $fnote{$ftag} .= "$'";
      }

      # If we get here, we need to ignore the remaining lines of this sub-record
      parse_ignore();

   }

}

# Parse an Individual record
sub parse_individual {
   my $itag = $_[0];

   # Format of Individual record (brackets denote optional entries)
   #   0 @XREF@ INDI
   # { 1 NAME }   ->    %name;
   # [ 1 SEX ]    ->    %sex;
   # { 1 BIRT }   ->    %birth;
   # { 1 DEAT }   ->    %death;
   # { 1 BURI }   ->    %burial;
   # { 1 CHR|CREM|ADOP|BAPM|BARM|BASM|BLES|CHRA|CONF|FCOM|
   #     ORDN|NATU|EMIG|IMMI|CENS|PROB|WILL|GRAD|RETI|EVEN }
   # { 1 CAST|DSCR|EDUC|IDNO|NATI|NCHI|NMR|OCCU|PROP|RELI|RESI|SSN|TITL }
   # { 1 BAPL|CONL|ENDL|SLGC }
   # { 1 FAMC }   ->    &familyup;
   # { 1 FAMS }   ->    %familydown;
   # { 1 SUBM }
   # { 1 ASSO }
   # { 1 ALIA }
   # { 1 ANCI }
   # { 1 DESI }
   # { 1 SOUR }
   # { 1 OBJE }
   # { 1 NOTE }   ->    %inote;
   # [ 1 RFN ]
   # [ 1 AFN ]
   # { 1 REFN }
   # [ 1 RIN ]
   # [ 1 CHAN ]

   # Read the first sub-line
   read_line();

   while ($currlevel > 0) {

      my $work;

      # Record this person's name
      # (If there are multiple entries of this type, use only the first)
      if ($currline =~ /^NAME\s+/) {
         if ($name{$itag} eq "") {
            $name{$itag} = "$'"; # Save the full name
         }
         $work = read_note();
         if ($work ne "") {
            $nnote{$itag} = $work;
         }
         next;
      }

      # Record this person's gender
      if ($currline =~ /^SEX\s+/) {
         if ($sex{$itag} eq "") {
            $sex{$itag} = uc substr("$'", 0, 1);
         }
      }

      # Record this person's birth date/place
      if ($currline =~ /^BIRT/) {
         if ($birth{$itag} eq "") {
            $birth{$itag} = read_event();
         }
         $work = read_note();
         if ($work ne "") {
            $bnote{$itag} .= $work;
         }
         next;
      }

      # Record this person's death date/place
      if ($currline =~ /^DEAT/) {
         if ($death{$itag} eq "") {
            $death{$itag} = read_event();
         }
         $work = read_note();
         if ($work ne "") {
            $dnote{$itag} .= $work;
         }
         next;
      }

      # Record this person's burial date/place
      if ($currline =~ /^BURI/) {
         if ($burial{$itag} eq "") {
            $burial{$itag} = read_event();
         }
         $work = read_note();
         if ($work ne "") {
            $burnote{$itag} .= $work;
         }
         next;
      }

      # Record the family in which this person is a child
      # (There may be multiple entries of this type)
      if ($currline =~ /^FAMC\s+\@(.*?)\@/) {
         if ($familyup{$itag} eq "") {
            $familyup{$itag} = $1;
         } else {
            $familyup{$itag} .= ",$1";
         }
         $work = read_note();
         if ($work ne "") {
            $funote{$itag} = $work;
         }
         next;
      }

      # Record the family in which this person is a spouse
      # (There may be multiple entries of this type)
      if ($currline =~ /^FAMS\s+\@(.*?)\@/) {
         if ($familydown{$itag} eq "") {
            $familydown{$itag} = $1;
         } else {
            $familydown{$itag} .= ",$1";
         }
         $work = read_note();
         if ($work ne "") {
            $fdnote{$itag} = $work;
         }
         next;
      }

      # Record any notes associated with this person
      if ($currline =~ /^NOTE\s+/) {
         $inote{$itag} .= "$'";
      }

      # If we get here, we need to ignore the remaining lines of this sub-record
      parse_ignore();

   }

}

# Parse a Note record
sub parse_note {
   # Save the note's data
   $note{$_[0]} = $_[1];

   # Format of Note record
   #   0 @XREF@ NOTE
   # { 1 SOUR }
   # { 1 REFN }
   # [ 1 RIN ]
   # [ 1 CHAN ]

   # And ignore all sub-records
   parse_ignore();
}

# Read lines until we find a level the same or lower than the current line
# Examples:
# If the current line is "0 HEAD", it will read until it finds another "0" line.
# if the current line is "1 NAME", it will read until it finds a "1" or a "0" line.
# In either case, the "current" line will be the line AFTER the ignored block
sub parse_ignore {
   my $thislevel = $currlevel;
   while (1) {
      read_line();
      if ($currlevel <= $thislevel) {
         last;
      }
   }
}

# Read and parse an "Event Detail" entry
sub read_event {
   my $thislevel = $currlevel;
   my $edate;
   my $eplace;
   while (1) {
      read_line();
      if ($currlevel <= $thislevel) {
         last;
      }
      if ($currline =~ /^DATE\s+/) {
         $edate = "$'";
      }
      if ($currline =~ /^PLAC\s+/) {
         $eplace = "$'";
      }
   }
   if ($eplace ne "" && $edate ne "") {
      "$edate, $eplace";
   } else {
      if ($eplace ne "") {
         $eplace;
      } else {
         $edate;
      }
   }
}

# Read the sub-records, looking for a NOTE entry
sub read_note {
   my $thislevel = $currlevel;
   my $thisnote;
   while (1) {
      read_line();
      if ($currlevel <= $thislevel) {
         last;
      }
      if ($currline =~ /^NOTE\s+/) {
         $thisnote .= "$'";
      }
   }
   $thisnote;
}

# Output a single family
sub output_family {
   my $ftag = $_[0];

   print "\nFamily $ftag\n";
   print "Husband: $name{$husband{$ftag}}\n";
   print "   Wife: $name{$wife{$ftag}}\n";

   my @children = split(/\,/, $children{$ftag});
   foreach my $ctag (@children) {
      print "  Child: $name{$ctag}\n";
   }
}
ged2wiki.1195945427.txt.gz · Last modified: 2007/11/24 17:03 by jims