by Jay2k1 • 3 years ago • PHP
  1. <?php
  2. /*
  3. This is a function that transforms the JSON you get from Google Takeout when you export your Hangouts history
  4. into a PHP array which can be used to further manipulate the data.
  5.  
  6. A use case is my hangouts parser at http://hangoutparser.jay2k1.com/ -- a description can be seen at
  7. http://blog.jay2k1.com/2014/11/10/how-to-export-and-backup-your-google-hangouts-chat-history/
  8.  
  9. You feed the function with the JSON, and in return you get a nice array holding all the conversations.
  10.    
  11. As a parameter, it expects the full filename (including path) to the Hangouts.json file
  12. It returns an array in this format:
  13.  
  14. $array[dates]                               contains the days (YYYY-MM-DD) that have at least one message (documented, but not set by hangoutsToArray() in this file)
  15. $array[0..N]                                array of conversations
  16. $array[0..N][name]                          conversation name. afaik only group chats can have one
  17. $array[0..N][type]                          conversation type. can be either STICKY_ONE_TO_ONE or GROUP
  18. $array[0..N][msgcount]                      message count for that conversation
  19. $array[0..N][members]                       array of conversation members where key = sender ID and value = sender name
  20. $array[0..N][dates]                         array of all dates of messages in this conversation (documented, but not set by hangoutsToArray() in this file)
  21. $array[0..N][messages]                      array of messages
  22. $array[0..N][messages][0..N]                array with message details
  23. $array[0..N][messages][0..N][timestamp]     timestamp of message in unixtime (actually, unixtime plus six more digits)
  24. $array[0..N][messages][0..N][datetime]      timestamp of message in YYYY-MM-DD HH:MM:SS format
  25. $array[0..N][messages][0..N][sender_id]     google's chat ID of the message's sender
  26. $array[0..N][messages][0..N][sender]        name of the message's sender (the "from")
  27. $array[0..N][messages][0..N][event_type]    type of message/event. can be RENAME_CONVERSATION, HANGOUT_EVENT, REGULAR_CHAT_MESSAGE, ADD_USER, REMOVE_USER, SMS, OTR_MODIFICATION, VOICEMAIL and maybe more...
  28. $array[0..N][messages][0..N][message]       the actual message text
  29. $array[0..N][messages][0..N][message_html]  HTML version of message text, if applicable (links are clickable, images are embedded etc)
  30.  
  31. So you could call it like this: $my_conversations = hangoutsToArray('/tmp/hangouts.json');
  32.  
  33. */
  34.  
  35. // stop notices from showing
  36. //error_reporting(E_ALL ^ E_NOTICE);
  37.  
  /**
   * Replaces the UTF-8 graphical emoticons (emoji) that Hangouts substitutes for
   * typed smileys by their ASCII equivalents.
   *
   * List of emoji codes taken from https://aprescott.com/posts/hangouts-emoji
   *
   * @param string $string text that may contain emoji
   * @return string the text with every known emoji replaced by its ASCII smiley;
   *                if PCRE cannot process the input (e.g. malformed UTF-8) the
   *                input is returned unchanged instead of being lost
   */
  function replaceSmileys($string) {
      // PCRE pattern => ASCII smiley. Keeping both in one map guarantees that
      // patterns and replacements can never get out of step.
      $smileys = array(
          '/\x{1F41D}/u' => '-<@%',    // honeybee
          '/\x{1F435}/u' => ':(|)',    // monkey face
          '/\x{1F437}/u' => ':(:)',    // pig face
          '/\x{1F473}/u' => '(]:{',    // man with turban
          '/\x{1F494}/u' => '</3',     // broken heart
          '/\x{1F49C}/u' => '<3',      // purple heart
          '/\x{1F4A9}/u' => '~@~',     // pile of poo
          '/\x{1F600}/u' => ':D',      // grinning face
          '/\x{1F601}/u' => '^_^',     // grinning face with smiling eyes
          '/\x{1F602}/u' => 'XD',      // face with tears of joy
          '/\x{1F603}/u' => ':)',      // smiling face with open mouth
          '/\x{1F604}/u' => '=D',      // smiling face with open mouth and smiling eyes
          '/\x{1F605}/u' => '^_^;;',   // smiling face with open mouth and cold sweat
          '/\x{1F607}/u' => 'O:)',     // smiling face with halo
          '/\x{1F608}/u' => '}:)',     // smiling face with horns
          '/\x{1F609}/u' => ';)',      // winking face
          '/\x{1F60E}/u' => 'B-)',     // smiling face with sunglasses
          '/\x{1F610}/u' => ':|',      // neutral face
          '/\x{1F611}/u' => '-_-',     // expressionless face
          '/\x{1F613}/u' => 'o_o;',    // face with cold sweat
          '/\x{1F614}/u' => 'u_u',     // pensive face
          '/\x{1F615}/u' => ':/',      // confused face
          '/\x{1F616}/u' => ':S',      // confounded face
          '/\x{1F617}/u' => ':*',      // kissing face
          '/\x{1F618}/u' => ';*',      // face throwing a kiss
          '/\x{1F61B}/u' => ':P',      // face with stuck-out tongue
          '/\x{1F61C}/u' => ';P',      // face with stuck-out tongue and winking eye
          '/\x{1F61E}/u' => ':(',      // disappointed face
          '/\x{1F621}/u' => '>.<',     // pouting face
          '/\x{1F622}/u' => ":'(",     // crying face
          '/\x{1F623}/u' => '>_<',     // persevering face
          '/\x{1F626}/u' => 'D:',      // frowning face with open mouth
          '/\x{1F62E}/u' => ':o',      // face with open mouth
          '/\x{1F632}/u' => ':O',      // astonished face
          '/\x{1F634}/u' => '-_-Zzz',  // sleeping face
          '/\x{1F635}/u' => 'x_x',     // dizzy face
          '/\x{1F638}/u' => ':3'       // grinning cat face with smiling eyes
      );

      $replaced = preg_replace(array_keys($smileys), array_values($smileys), $string);

      // preg_replace() yields null when PCRE fails (the /u modifier rejects
      // malformed UTF-8); fall back to the untouched input rather than
      // silently turning the whole message into null.
      return ($replaced === null ? $string : $replaced);
  }
  122.  
  /**
   * Transforms the Hangouts.json file from a Google Takeout export into a PHP array.
   *
   * The file is read line by line and cut into one JSON snippet per conversation
   * (start and end of a conversation are recognised by their indentation), then
   * every snippet is decoded and flattened on its own.
   *
   * Each returned conversation has the keys 'type', 'msgcount', 'name', 'members'
   * (sender ID => sender name) and 'messages'. Each message has 'timestamp',
   * 'datetime', 'sender_id', 'sender', 'event_type' and, for the event types handled
   * below, 'message'. Chat messages additionally get 'message_html' when the HTML
   * rendering differs from the plain text. All text placed into 'message_html' is
   * HTML-escaped, because it comes straight from other people's chat messages.
   *
   * The script is terminated via die() when the input file cannot be opened or
   * when no conversation could be parsed.
   *
   * @param string $jsonfile file name and path of Hangouts.json (must be readable by the PHP user), e.g. '/tmp/Hangouts.json'
   * @param int    $version  1 for Google's old format (until March 2018) (first line has "conversation_state"), 2 for newer version
   * @return array list of conversations (re-indexed from 0), messages sorted by timestamp
   */
  function hangoutsToArray($jsonfile, $version = 2) {
      // set the desired timestamp format here
      // the default is 'Y-m-d H:i:s' which is YYYY-MM-DD HH:mm:ss.
      $timestamp_format = 'Y-m-d H:i:s';

      $handle = @fopen($jsonfile, "r");
      if (!$handle) { die('failed to open input file '.$jsonfile.' for reading'); }

      if ($version == 1) {
          $startregex = '/"conversation_state" : {/';
          $endregex   = '/^  }/';
      } else {
          $startregex = '/^    \{$/';
          $endregex   = '/^    \},?$/';
      }

      // raw JSON text of every conversation, keyed 1..N
      $outfiles = array();
      $conversation_count = 0;
      $inside = false;

      // Read whole lines (no length limit): a chunked read could cut a long
      // message line into pieces that the anchored regexes then misinterpret.
      while (($buffer = fgets($handle)) !== false) {
          // handle conversation ends: the closing line itself is not copied,
          // a bare "}" takes its place so the snippet stays valid JSON
          if ($inside && preg_match($endregex, $buffer)) {
              $outfiles[$conversation_count] .= "}";
              $inside = false;
          }

          // handle conversation starts
          if (preg_match($startregex, $buffer)) {
              $conversation_count++;
              $inside = true;
              $outfiles[$conversation_count] = ($version == 1 ? "{\n" : "");
              error_log("* reading conversation $conversation_count ...");
          }

          // write data from conversation
          if ($inside) { $outfiles[$conversation_count] .= $buffer; }
      }
      fclose($handle);

      $total = count($outfiles);
      $return = array();
      $cstate = ($version == 1 ? 'conversation_state' : 'conversation');

      // now we have one array entry per conversation; decode, release and parse each of them
      for ($a = 1; $a <= $total; $a++) {

          error_log("* decoding JSON of conversation ".$a."/".$total."...");
          $decoded = json_decode($outfiles[$a], true);

          // reduce memory footprint
          $outfiles[$a] = '';

          // make sure there's a conversation inside (also catches snippets that failed to decode)
          if (!isset($decoded[$cstate])) {
              error_log('skipped empty conversation '.($a));
              continue;
          }

          $meta = (isset($decoded[$cstate]['conversation']) ? $decoded[$cstate]['conversation'] : array());
          if ($version == 1) {
              $events = (isset($decoded['conversation_state']['event']) ? $decoded['conversation_state']['event'] : array());
          } else {
              $events = (isset($decoded['events']) ? $decoded['events'] : array());
          }
          if (!is_array($events)) { $events = array(); }

          // first, get metadata
          $return[$a]['type'] = (isset($meta['type']) ? $meta['type'] : null);
          $return[$a]['msgcount'] = count($events);
          $return[$a]['name'] = (isset($meta['name']) ? $meta['name'] : "");

          // conversation participants
          $return[$a]['members'] = array();
          if (isset($meta['participant_data']) && is_array($meta['participant_data'])) {
              foreach ($meta['participant_data'] as $participant) {
                  if (!isset($participant['id']['chat_id'])) continue;
                  $id = $participant['id']['chat_id'];
                  // use "unknown_<chat_id>" as name if they don't have a fallback_name
                  $return[$a]['members'][$id] = (isset($participant['fallback_name']) ? $participant['fallback_name'] : 'unknown_'.$id);
              }
          }
          $members = $return[$a]['members'];

          // loop through messages/events
          $return[$a]['messages'] = array();
          foreach ($events as $k => $this_event) {
              $message = array();

              // get (unixtime) timestamp
              $timestamp = (isset($this_event['timestamp']) ? $this_event['timestamp'] : null);
              $message['timestamp'] = $timestamp;

              // get (human readable) timestamp; the first ten digits are the seconds
              $message['datetime'] = date($timestamp_format, (int) substr((string) $timestamp, 0, 10));

              // get sender id
              $sender_id = (isset($this_event['sender_id']['chat_id']) ? $this_event['sender_id']['chat_id'] : null);
              $message['sender_id'] = $sender_id;

              // get sender name; the fallback has to name the sender, not whichever
              // participant happened to be processed last
              $message['sender'] = (($sender_id !== null && isset($members[$sender_id])) ? $members[$sender_id] : 'unknown_'.$sender_id);

              // get event type
              $event_type = (isset($this_event['event_type']) ? $this_event['event_type'] : null);
              $message['event_type'] = $event_type;

              $content = (isset($this_event['chat_message']['message_content']) ? $this_event['chat_message']['message_content'] : array());

              // handle event (event types not listed here get no 'message' entry)
              switch ($event_type) {
                  case 'RENAME_CONVERSATION':
                      $newname = (isset($this_event['conversation_rename']['new_name']) ? $this_event['conversation_rename']['new_name'] : '');
                      $oldname = (isset($this_event['conversation_rename']['old_name']) ? $this_event['conversation_rename']['old_name'] : '');
                      $message['message'] = 'changed conversation name '.($oldname != '' ? 'from \''.$oldname.'\' ' : '').'to \''.$newname.'\'';
                      break;

                  case 'HANGOUT_EVENT':
                      $hangout_type = (isset($this_event['hangout_event']['event_type']) ? $this_event['hangout_event']['event_type'] : null);
                      if ($hangout_type === 'START_HANGOUT') {
                          $message['message'] = 'started a video chat';
                      } else if ($hangout_type === 'END_HANGOUT') {
                          $message['message'] = 'ended a video chat';
                      } else {
                          $message['message'] = $hangout_type;
                      }
                      break;

                  case 'SMS':
                  case 'REGULAR_CHAT_MESSAGE':
                      $message['message'] = "";
                      $msg = "";
                      $msghtml = "";
                      // join message segments together
                      if (isset($content['segment'])) {
                          foreach ($content['segment'] as $segment) {
                              if (!isset($segment['text'])) continue;
                              $segtype = (isset($segment['type']) ? $segment['type'] : '');
                              // smileys are inserted before escaping so that '<3' and
                              // friends end up as valid HTML text
                              $safetext = htmlspecialchars(replaceSmileys($segment['text']), ENT_QUOTES, 'UTF-8');
                              if ($segtype == 'TEXT' || $segtype == 'LINE_BREAK') {
                                  $msg .= $segment['text'];
                                  $msghtml .= str_replace("\n", '<br>', $safetext);
                              } else if ($segtype == 'LINK') {
                                  $msg .= $segment['text'];
                                  // use the original text as link if it is a valid url, else use the link target
                                  if (filter_var($segment['text'], FILTER_VALIDATE_URL) || !isset($segment['link_data']['link_target'])) {
                                      $link = $segment['text'];
                                  } else {
                                      $link = $segment['link_data']['link_target'];
                                  }
                                  $msghtml .= '<a href="'.htmlspecialchars($link, ENT_QUOTES, 'UTF-8').'" target="_blank">'.$safetext.'</a>';
                              }
                          }
                      }
                      // handle attachments
                      // NOTE(review): as before, attachments are only looked at when the
                      // message has no text segments -- confirm whether both can occur together
                      else if (isset($content['attachment'])) {
                          $photokey = ($version == 1 ? 'embeds.PlusPhoto.plus_photo' : 'plus_photo');
                          foreach ($content['attachment'] as $att) {
                              if (!isset($att['embed_item']['type'][0]) || $att['embed_item']['type'][0] != 'PLUS_PHOTO') continue;
                              if (!isset($att['embed_item'][$photokey]['url'])) continue;
                              $imgurl = $att['embed_item'][$photokey]['url'];
                              $safeurl = htmlspecialchars($imgurl, ENT_QUOTES, 'UTF-8');
                              $msg .= $imgurl;
                              // the tooltip shows the (decoded) file name part of the URL
                              $msghtml .= '<a href="'.$safeurl.'" target="_blank"><img src="'.$safeurl.'" alt="attached image" style="max-width:400px;max-height:400px;" title="'.htmlspecialchars(urldecode(preg_replace('/^.*\//', '', $imgurl)), ENT_QUOTES, 'UTF-8').'"></a>';
                          }
                      }
                      // replace unicode emoticon characters by smileys
                      $message['message'] = replaceSmileys($msg);
                      // only keep the HTML version when it actually differs from the plain text
                      if ($message['message'] !== $msghtml) { $message['message_html'] = $msghtml; }
                      break;

                  case 'ADD_USER':
                  case 'REMOVE_USER':
                      $userid = (isset($this_event['membership_change']['participant_id'][0]['chat_id']) ? $this_event['membership_change']['participant_id'][0]['chat_id'] : null);
                      $username = (($userid !== null && isset($members[$userid])) ? $members[$userid] : 'unknown_'.$userid);
                      if ($event_type == 'ADD_USER') {
                          $message['message'] = 'added user \''.$username.'\' to conversation';
                      } else {
                          $message['message'] = 'removed user \''.$username.'\' from conversation';
                      }
                      break;

                  case 'OTR_MODIFICATION':
                      $message['message'] = 'unknown OTR_MODIFICATION';
                      break;

                  case 'VOICEMAIL':
                      $voicemail = "new voicemail:\n";
                      // join message segments together
                      if (isset($content['segment'])) {
                          foreach ($content['segment'] as $segment) {
                              if (!isset($segment['text'])) continue;
                              $voicemail .= $segment['text'];
                          }
                      }
                      // replace unicode emoticon characters by smileys
                      $message['message'] = replaceSmileys($voicemail);
                      break;
              }

              $return[$a]['messages'][$k] = $message;
          }

          // reduce memory footprint
          unset($decoded, $events, $meta);

          // sort messages by timestamp because the export does not store them in order.
          // Compare instead of subtracting: the difference of two microsecond
          // timestamps does not fit into an integer on 32-bit builds.
          usort($return[$a]['messages'], function ($first, $second) {
              if ($first['timestamp'] == $second['timestamp']) { return 0; }
              return ($first['timestamp'] < $second['timestamp'] ? -1 : 1);
          });

          error_log('mem: '.round((memory_get_usage() / 1024 / 1024), 1).'MB used/'.round((memory_get_usage(true) / 1024 / 1024), 1).'MB real -- parsed convo '.$a.'/'.$total.' ('.$return[$a]['msgcount'].' msgs)');

      }
      error_log('mem: '.round((memory_get_usage() / 1024 / 1024), 1).'MB used/'.round((memory_get_usage(true) / 1024 / 1024), 1).'MB real -- done.');
      if (count($return) == 0) die('Error: no conversations found');

      return array_values($return);
  }
  331.  

Replies to Hangout Parser function rss

Title Name Language When
Re: Hangout Parser function Stained Prairie Dog javascript 2 months ago.
Re: Hangout Parser function Botched Water Vole javascript 2 months ago.