<?php
/*
This is a function that transforms the JSON you get from Google Takeout when you export your Hangouts history
into a PHP array which can be used to further manipulate the data.
A use case is my hangouts parser at http://hangoutparser.jay2k1.com/ -- a description can be seen at
http://blog.jay2k1.com/2014/11/10/how-to-export-and-backup-your-google-hangouts-chat-history/
You feed the function with the JSON, and in return you get a nice array holding all the conversations.
As a parameter, it expects the full filename (including path) to the Hangouts.json file
It returns an array in this format:
$array[dates] contains the days (YYYY-MM-DD) that have at least one message
$array[0..N] array of conversations
$array[0..N][name] conversation name. afaik only group chats can have one
$array[0..N][type] conversation type. can be either STICKY_ONE_TO_ONE or GROUP
$array[0..N][msgcount] message count for that conversation
$array[0..N][members] array of conversation members where key = sender ID and value = sender name
$array[0..N][dates] array of all dates of messages in this conversation
$array[0..N][messages] array of messages
$array[0..N][messages][0..N] array with message details
$array[0..N][messages][0..N][timestamp] timestamp of message in unixtime (actually, unixtime plus six more digits)
$array[0..N][messages][0..N][datetime] timestamp of message in YYYY-MM-DD HH:MM:SS format
$array[0..N][messages][0..N][sender_id] google's chat ID of the message's sender
$array[0..N][messages][0..N][sender] name of the message's sender (the "from")
$array[0..N][messages][0..N][event_type] type of message/event. can be RENAME_CONVERSATION, HANGOUT_EVENT, REGULAR_CHAT_MESSAGE, ADD_USER, REMOVE_USER, SMS, OTR_MODIFICATION, VOICEMAIL and maybe more...
$array[0..N][messages][0..N][message] the actual message text
$array[0..N][messages][0..N][message_html] HTML version of message text, if applicable (links are clickable, images are embedded etc)
So you could call it like this: $my_conversations = hangoutsToArray('/tmp/hangouts.json');
*/
// stop notices from showing
//error_reporting(E_ALL ^ E_NOTICE);
/**
 * Replaces the graphical (UTF-8) emoji that Hangouts substitutes for typed
 * emoticons by an ASCII equivalent.
 *
 * List of emoji codes taken from https://aprescott.com/posts/hangouts-emoji
 *
 * @param string $string text that may contain emoji characters
 * @return string the text with every known emoji replaced; the input is
 *                returned unchanged if it cannot be processed (e.g. because
 *                it is not valid UTF-8)
 */
function replaceSmileys($string) {
	// Unicode code point (hex) => ASCII emoticon.
	// None of the replacements contains "$" or "\", so they are safe to use
	// as preg_replace() replacement strings.
	$emoticons = array(
		'1F41D' => '-<@%',   // honeybee
		'1F435' => ':(|)',   // monkey face
		'1F437' => ':(:)',   // pig face
		'1F473' => '(]:{',   // man with turban
		'1F494' => '</3',    // broken heart
		'1F49C' => '<3',     // purple heart
		'1F4A9' => '~@~',    // pile of poo
		'1F600' => ':D',     // grinning face
		'1F601' => '^_^',    // grinning face with smiling eyes
		'1F602' => 'XD',     // face with tears of joy
		'1F603' => ':)',     // smiling face with open mouth
		'1F604' => '=D',     // smiling face with open mouth and smiling eyes
		'1F605' => '^_^;;',  // smiling face with open mouth and cold sweat
		'1F607' => 'O:)',    // smiling face with halo
		'1F608' => '}:)',    // smiling face with horns
		'1F609' => ';)',     // winking face
		'1F60E' => 'B-)',    // smiling face with sunglasses
		'1F610' => ':|',     // neutral face
		'1F611' => '-_-',    // expressionless face
		'1F613' => 'o_o;',   // face with cold sweat
		'1F614' => 'u_u',    // pensive face
		'1F615' => ':/',     // confused face
		'1F616' => ':S',     // confounded face
		'1F617' => ':*',     // kissing face
		'1F618' => ';*',     // face throwing a kiss
		'1F61B' => ':P',     // face with stuck-out tongue
		'1F61C' => ';P',     // face with stuck-out tongue and winking eye
		'1F61E' => ':(',     // disappointed face
		'1F621' => '>.<',    // pouting face
		'1F622' => ":'(",    // crying face
		'1F623' => '>_<',    // persevering face
		'1F626' => 'D:',     // frowning face with open mouth
		'1F62E' => ':o',     // face with open mouth
		'1F632' => ':O',     // astonished face
		'1F634' => '-_-Zzz', // sleeping face
		'1F635' => 'x_x',    // dizzy face
		'1F638' => ':3',     // grinning cat face with smiling eyes
	);

	$patterns = array();
	foreach (array_keys($emoticons) as $codepoint) {
		$patterns[] = '/\x{'.$codepoint.'}/u';
	}

	$replaced = preg_replace($patterns, array_values($emoticons), $string);

	// preg_replace() yields null on failure (the /u modifier rejects invalid
	// UTF-8); keep the original text instead of losing the whole message
	return ($replaced === null ? $string : $replaced);
}
/**
 * Converts a Google Takeout Hangouts.json export into a PHP array.
 *
 * The file is read line by line and cut into one JSON chunk per conversation,
 * so that only a single conversation has to be decoded at a time.
 *
 * string $jsonfile: file name and path of Hangouts.json (must be readable by the PHP user), e.g. '/tmp/Hangouts.json'
 * int $version: 1 for Google's old format (until March 2018) (first line has "conversation_state"), 2 for newer version
 *
 * Returns an array of conversations. Keys are the conversation numbers in
 * file order, starting at 1; conversations that could not be decoded are
 * skipped and leave a gap. Each conversation holds 'type', 'msgcount',
 * 'name', 'members' (chat id => name) and 'messages' (sorted by timestamp).
 * Each message holds 'timestamp', 'datetime', 'sender_id', 'sender',
 * 'event_type' and, for the handled event types, 'message'. 'message_html'
 * is only set when the HTML rendering differs from the plain text.
 *
 * Terminates the script via die() if the file cannot be opened or if it
 * contains no usable conversation.
 */
function hangoutsToArray($jsonfile, $version = 2) {
	// set the desired timestamp format here
	// the default is 'Y-m-d H:i:s' which is YYYY-MM-DD HH:mm:ss.
	$timestamp_format = 'Y-m-d H:i:s';

	$handle = @fopen($jsonfile, "r");
	if (!$handle) { die('failed to open input file '.$jsonfile.' for reading'); }

	// A conversation starts on the "conversation_state" line (version 1) or
	// on an indented line holding nothing but "{" (version 2). The leading
	// whitespace is captured so the matching closing brace can be recognised
	// whatever indentation width the export uses.
	$startregex = ($version == 1 ? '/^(\s*)"conversation_state" : \{/' : '/^(\s+)\{\s*$/');
	$endregex = null; // only set while we are inside a conversation

	$chunks = array();
	$conversation_count = 0;

	// read the whole file and split the conversations up into $chunks.
	// no length limit on fgets(): a line must never be cut in half
	while (($buffer = fgets($handle)) !== false) {
		if ($endregex === null) {
			// handle conversation starts
			if (preg_match($startregex, $buffer, $match)) {
				$conversation_count++;
				$endregex = '/^'.preg_quote($match[1], '/').'\}'.($version == 1 ? '' : ',?\s*$').'/';
				// version 1 chunks start with a bare key, so wrap them in an object
				$chunks[$conversation_count] = ($version == 1 ? "{\n" : "").$buffer;
				error_log("* reading conversation $conversation_count ...");
			}
		} else if (preg_match($endregex, $buffer)) {
			// handle conversation ends: close the conversation (and, for
			// version 1, the wrapper object) without the trailing comma
			$chunks[$conversation_count] .= ($version == 1 ? "}}" : "}");
			$endregex = null;
		} else {
			// write data from conversation
			$chunks[$conversation_count] .= $buffer;
		}
	}
	fclose($handle);

	$total = $conversation_count;
	$return = array();

	// now we have a bunch of chunks, each containing one conversation.
	// decode them one by one and parse their contents.
	for ($a = 1; $a <= $total; $a++) {
		error_log("* decoding JSON of conversation ".$a."/".$total."...");
		$decoded = json_decode($chunks[$a], true);
		// reduce memory footprint
		$chunks[$a] = '';

		$conversation = _hangoutsParseConversation($decoded, $version, $timestamp_format);
		// reduce memory footprint
		unset($decoded);

		// make sure there's a conversation inside
		if ($conversation === null) {
			error_log('skipped empty conversation '.($a));
			continue;
		}
		$return[$a] = $conversation;
	}

	if (sizeof($return) == 0) die('Error: no conversations found');
	return $return;
}

/**
 * Builds the result entry for one decoded conversation.
 * Returns null if $decoded does not contain a conversation.
 */
function _hangoutsParseConversation($decoded, $version, $timestamp_format) {
	$cstate = ($version == 1 ? 'conversation_state' : 'conversation');
	if (!is_array($decoded) || !isset($decoded[$cstate])) return null;

	$meta = (isset($decoded[$cstate]['conversation']) ? $decoded[$cstate]['conversation'] : array());
	if ($version == 1) {
		$events = (isset($decoded['conversation_state']['event']) ? $decoded['conversation_state']['event'] : array());
	} else {
		$events = (isset($decoded['events']) ? $decoded['events'] : array());
	}

	// first, get metadata
	$conversation = array(
		'type' => (isset($meta['type']) ? $meta['type'] : ''),
		'msgcount' => sizeof($events),
		'name' => (isset($meta['name']) ? $meta['name'] : ""),
		'members' => array(),
		'messages' => array(),
	);

	// conversation participants
	if (isset($meta['participant_data'])) {
		foreach ($meta['participant_data'] as $participant) {
			if (!isset($participant['id']['chat_id'])) continue;
			$id = $participant['id']['chat_id'];
			// use "unknown_<chat_id>" as name if they don't have a fallback_name
			$conversation['members'][$id] = (isset($participant['fallback_name']) ?
				$participant['fallback_name'] : 'unknown_'.$id);
		}
	}

	// loop through messages/events
	foreach ($events as $event) {
		$conversation['messages'][] = _hangoutsParseEvent($event, $conversation['members'], $version, $timestamp_format);
	}

	// sort messages by timestamp because the export does not keep them in order.
	// compare instead of subtracting: the 16-digit values do not fit into
	// 32-bit integers and the callback has to return an int
	usort($conversation['messages'], function($left, $right) {
		if ($left['timestamp'] == $right['timestamp']) return 0;
		return ($left['timestamp'] < $right['timestamp'] ? -1 : 1);
	});

	return $conversation;
}

/**
 * Builds the message entry for a single event of a conversation.
 * $members maps chat ids to names and is used to resolve user names.
 */
function _hangoutsParseEvent($event, $members, $version, $timestamp_format) {
	$timestamp = (isset($event['timestamp']) ? (string) $event['timestamp'] : '0');
	$sender_id = (isset($event['sender_id']['chat_id']) ? $event['sender_id']['chat_id'] : '');
	$event_type = (isset($event['event_type']) ? $event['event_type'] : '');

	$message = array(
		// unixtime plus six more digits
		'timestamp' => $timestamp,
		// the first ten digits are the seconds
		'datetime' => date($timestamp_format, (int) substr($timestamp, 0, 10)),
		'sender_id' => $sender_id,
		// fall back to the sender's own id if they are not a known member
		'sender' => (isset($members[$sender_id]) ? $members[$sender_id] : 'unknown_'.$sender_id),
		'event_type' => $event_type,
	);

	switch ($event_type) {
		case 'RENAME_CONVERSATION':
			$rename = (isset($event['conversation_rename']) ? $event['conversation_rename'] : array());
			$newname = (isset($rename['new_name']) ? $rename['new_name'] : '');
			$oldname = (isset($rename['old_name']) ? $rename['old_name'] : '');
			$message['message'] = 'changed conversation name '.($oldname != '' ? 'from \''.$oldname.'\' ' : '').'to \''.$newname.'\'';
			break;
		case 'HANGOUT_EVENT':
			$hangout_type = (isset($event['hangout_event']['event_type']) ? $event['hangout_event']['event_type'] : '');
			switch ($hangout_type) {
				case 'START_HANGOUT':
					$message['message'] = 'started a video chat';
					break;
				case 'END_HANGOUT':
					$message['message'] = 'ended a video chat';
					break;
				default:
					$message['message'] = $hangout_type;
			}
			break;
		case 'SMS':
		case 'REGULAR_CHAT_MESSAGE':
			$content = (isset($event['chat_message']['message_content']) ? $event['chat_message']['message_content'] : array());
			list($plain, $html) = _hangoutsRenderChatMessage($content, $version);
			$message['message'] = $plain;
			// only provide an HTML version if it adds something
			if ($plain !== $html) { $message['message_html'] = $html; }
			break;
		case 'ADD_USER':
		case 'REMOVE_USER':
			// only the first participant of the membership change is reported
			$userid = (isset($event['membership_change']['participant_id'][0]['chat_id']) ?
				$event['membership_change']['participant_id'][0]['chat_id'] : '');
			$username = (isset($members[$userid]) ? $members[$userid] : 'unknown_'.$userid);
			$message['message'] = ($event_type == 'ADD_USER' ?
				'added user \''.$username.'\' to conversation' :
				'removed user \''.$username.'\' from conversation');
			break;
		case 'OTR_MODIFICATION':
			$message['message'] = 'unknown OTR_MODIFICATION';
			break;
		case 'VOICEMAIL':
			$text = "new voicemail:\n";
			// join message segments together
			if (isset($event['chat_message']['message_content']['segment'])) {
				foreach ($event['chat_message']['message_content']['segment'] as $segment) {
					if (!isset($segment['text'])) continue;
					$text .= $segment['text'];
				}
			}
			// replace unicode emoticon characters by smileys
			$message['message'] = replaceSmileys($text);
			break;
	}

	return $message;
}

/**
 * Renders the message_content of a chat message.
 * Returns array(plain text, HTML); in the HTML version text is escaped,
 * links are clickable and attached photos are embedded.
 */
function _hangoutsRenderChatMessage($content, $version) {
	$plain = "";
	$html = "";

	// join message segments together
	if (isset($content['segment'])) {
		foreach ($content['segment'] as $segment) {
			$type = (isset($segment['type']) ? $segment['type'] : '');
			if ($type == 'LINE_BREAK') {
				$plain .= (isset($segment['text']) ? $segment['text'] : "\n");
				$html .= "<br>\n";
				continue;
			}
			if (!isset($segment['text'])) continue;
			$text = $segment['text'];
			// the emoticons are substituted before escaping so that e.g. "<3" gets escaped, too
			$text_html = htmlspecialchars(replaceSmileys($text), ENT_QUOTES, 'UTF-8');
			if ($type == 'TEXT') {
				$plain .= $text;
				$html .= $text_html;
			} else if ($type == 'LINK') {
				$plain .= $text;
				// use the original text as link if it is a valid url, else use the link target
				if (filter_var($text, FILTER_VALIDATE_URL) || !isset($segment['link_data']['link_target'])) {
					$link = $text;
				} else {
					$link = $segment['link_data']['link_target'];
				}
				$html .= '<a href="'.htmlspecialchars($link, ENT_QUOTES, 'UTF-8').'" target="_blank">'.$text_html.'</a>';
			}
		}
	}

	// handle attachments (a message can carry both text and attachments)
	if (isset($content['attachment'])) {
		$photo_key = ($version == 1 ? 'embeds.PlusPhoto.plus_photo' : 'plus_photo');
		foreach ($content['attachment'] as $att) {
			if (!isset($att['embed_item']['type'][0]) || $att['embed_item']['type'][0] != 'PLUS_PHOTO') continue;
			if (!isset($att['embed_item'][$photo_key]['url'])) continue;
			$imgurl = $att['embed_item'][$photo_key]['url'];
			// keep the url apart from any preceding text
			if ($plain != "") {
				$plain .= "\n";
				$html .= "<br>\n";
			}
			$plain .= $imgurl;
			$imgurl_html = htmlspecialchars($imgurl, ENT_QUOTES, 'UTF-8');
			$html .= '<a href="'.$imgurl_html.'" target="_blank"><img src="'.$imgurl_html.'" alt="attached image" style="max-width:400px;max-height:400px;" title="'.htmlspecialchars(urldecode(preg_replace('/^.*\//','',$imgurl))).'"></a>';
		}
	}

	// replace unicode emoticon characters by smileys
	return array(replaceSmileys($plain), $html);
}