Wednesday, October 9, 2013

LG Neon message parsing

Sometimes I enjoy looking back over very old text messages from my phone. My current phone is an old LG Neon. It can save text messages to a text file on the SD card so I don't lose them when the text message memory on the phone fills up, but it cannot save the multimedia messages. I want to keep all of these and view them later. I also decided that I would like to keep the data on Google Drive, and to get some experience with Google Apps Script. This all leads to taking the save files from my phone and making them useful with a Google Apps Script.

The LG Neon saves text messages as UTF-16LE text; however, the leading character of the text indicates that they are saved as UTF-16BE. The only character set that is documented to be supported in Apps Script is UTF-8, so I did the UTF-16 decoding manually in the script.

/**
 * Decodes the raw bytes of an LG Neon text-message export into a string.
 *
 * The LG Neon saves text messages in UTF-16LE, with the header bytes for
 * UTF-16BE, so the two header bytes are skipped rather than interpreted.
 *
 * @param {number[]} bytes Raw file bytes. Each value is masked with 0xff, so
 *     signed byte values are accepted as well as unsigned ones.
 * @return {string} The decoded text. A dangling odd byte at the end of the
 *     input is ignored because it cannot form a complete 16-bit unit.
 */
function decodeLgText(bytes) {
  var pieces = [];
  // Start at 2 to skip the (misleading) two-byte header.
  for (var i = 2; i + 1 < bytes.length; i += 2) {
    var unit = (bytes[i] & 0xff) | ((bytes[i + 1] & 0xff) << 8);
    // JavaScript strings are sequences of UTF-16 code units, so surrogate
    // halves are copied through unchanged. Combining a pair into a code point
    // above 0xFFFF and passing it to String.fromCharCode would truncate it to
    // 16 bits and corrupt the character.
    pieces.push(String.fromCharCode(unit));
  }
  return pieces.join("");
}

Two blocks of text from the export file look like this:
1) From : +1555555555(Sample Name)
   Sent : 2013/04/11 17:58
   Contents :
   See you

227) To : +15555555555(Dear Friend)
   Sent : 2013/04/03 20:37
   Contents :
   Back in my very warm fun fur hammock tonight, ther
   e was some pretty snow today, and i get to sleep i
   n a bit tomorrow :)
Sweet dreams! <3

The contents section of the records is interesting: the raw text message is wrapped with "\r\n" line breaks, whereas a line break that was part of the text message itself appears as a bare "\n". The record ends with a double "\r\n" line break. The solution to parsing this that I finally settled on has two parts: the first decodes each record into an array of lines, with the indentation and message number stripped off, and the second turns the array of lines into a useful data structure.
/**
 * Splits a decoded LG export into records and passes each record's lines to
 * a callback.
 *
 * Records are separated by a double "\r\n"; empty records are skipped. Within
 * a record, lines are separated by "\r\n". The leading message number
 * ("227)") and the three-space indentation are removed from the lines before
 * they are handed over.
 *
 * @param {Object} blob Object whose getBytes() result is fed to decodeLgText.
 * @param {function(string[])} body Called once per non-empty record with the
 *     cleaned-up lines of that record.
 */
function forEachLgBlock(blob, body) {
  var records = decodeLgText(blob.getBytes()).split("\r\n\r\n");

  records.forEach(function(record) {
    if (record.length === 0) {
      return;
    }

    var rows = record.split("\r\n", -1);
    // Swap the "N)" prefix for two spaces so the first row carries the same
    // three-space indent as the others and can be stripped the same way.
    rows[0] = rows[0].replace(/^\d+\)/, "  ");

    body(rows.map(function(row) {
      return row.replace(/^   /, "");
    }));
  });
}

/**
 * Parses every text message in an LG Neon export and passes each one to a
 * callback as a structured object.
 *
 * Each message object has the shape:
 *   { type, subject, date, from: [...], to: [...], body: [ { type, value } ] }
 * where `type` is "incoming" or "outgoing" (or "" if no From/To line was
 * seen), `date` is "YYYY-MM-DD HH:MM" (or "" if no Sent line matched), and
 * `from` / `to` hold { address, name } entries (name omitted when the export
 * has none).
 *
 * @param {Object} blob Export file blob, forwarded to forEachLgBlock.
 * @param {function(Object)} body Called once per message record.
 */
function forEachLgSms(blob, body) {
  // Returns the address entry for a "To"/"From" header line, or null when the
  // line does not match. The named form is tried first so that "(Name)" is not
  // swallowed into the address.
  function parseParty(line, namedPattern, barePattern) {
    var match = line.match(namedPattern);
    if (match) {
      return { address: match[1], name: match[2] };
    }
    match = line.match(barePattern);
    if (match) {
      return { address: match[1] };
    }
    return null;
  }

  forEachLgBlock(blob, function(lines) {
    var m = {
      type: "",
      subject: "",
      date: "",
      from: [],
      to: [],
      body: [ { type: "text", value: "" } ]
    };

    // Header section: everything up to the "Contents :" marker.
    var lineIndex;
    for (lineIndex = 0; lineIndex < lines.length; lineIndex++) {
      var line = lines[lineIndex];

      var party = parseParty(line, /^To : (.*?)\((.*?)\)$/, /^To : (.*)$/);
      if (party) {
        // Push the entry itself; wrapping it in another array would nest the
        // list one level too deep.
        m.to.push(party);
        m.type = "outgoing";
        continue;
      }

      party = parseParty(line, /^From : (.*?)\((.*?)\)$/, /^From : (.*)$/);
      if (party) {
        m.from.push(party);
        m.type = "incoming";
        continue;
      }

      var match = line.match(/^Sent : (\d\d\d\d)\/(\d\d)\/(\d\d) (\d\d:\d\d)$/);
      if (match) {
        m.date = match[1] + "-" + match[2] + "-" + match[3] + " " + match[4];
        continue;
      }

      if (/^Contents :$/.test(line)) {
        break;
      }
    }

    // Body section: the remaining lines are joined with no separator.
    var text = "";
    for (lineIndex++; lineIndex < lines.length; lineIndex++) {
      text += lines[lineIndex];
    }
    // String.prototype.replace returns a new string; the result must be
    // stored for the normalisation to take effect.
    m.body[0].value = text.replace(/\n\r/g, "\n");

    body(m);
  });
}
In the main program this parser is called with a callback, making it appear similar to a loop.
// Accumulates every parsed message from all export files.
var messages = new Array();

  // sms messages: walk every file in the Drive folder and collect its records
  // (the folder id argument is elided in this excerpt)
  var folder = DriveApp.getFolderById(...);
  var files = folder.getFiles();
  while(files.hasNext()) {
    var file = files.next();
    // forEachLgSms invokes the callback once per message record in the file
    forEachLgSms(file.getBlob(), function(message) {
      messages.push(message);
    });
  }
The two examples above would parse into this:
[
  {
    type: "incoming",
    subject: "",
    date: "2013-04-11 17:58",
    from: [ { address: "+1555555555", name: "Sample Name" } ],
    to: [ ],
    body: [ { type: "text", value: "See you" } ]
  },
  {
    type: "outgoing",
    subject: "",
    date: "2013-04-03 20:37",
    from: [ ],
    to: [ { address: "+15555555555", name: "Dear Friend" } ],
    body: [ { type: "text", value: "Back in my very warm fun fur hammock tonight, there was some pretty snow today, and i get to sleep in a bit tomorrow :)\nSweet dreams! <3" } ]
  }
]

What I did with the multi-media messages will be covered another time.

No comments:

Post a Comment