
    
BjTC                       U d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlmZmZmZmZ i Zded<   dD ]  Z eeed      Zeee ee      <    dd	Zdd
ZddZddZdddZdd	 	 	 	 	 	 	 ddZddZddZddZddZy)u  Shutdown forensics — capture context when the gateway receives SIGTERM/SIGINT.

The gateway's ``shutdown_signal_handler`` runs synchronously inside the
asyncio event loop.  We can't safely block it for long, but we DO want a
durable record of who/what triggered the shutdown so that "the gateway
keeps dying" incidents can be diagnosed after the fact.

This module exposes :func:`snapshot_shutdown_context`, a fast (<10ms),
non-blocking probe that returns a structured dict the signal handler can
log immediately, plus :func:`spawn_async_diagnostic`, a fire-and-forget
``ps`` walk that runs as a detached subprocess so it can't block teardown
even if /proc is wedged.

Anything that needs to wait (e.g. shelling out to ``ps aux``) belongs in
the async helper, never in the synchronous probe.
    )annotationsN)Path)AnyDictListOptionalzDict[int, str]_SIGNAL_NAME_BY_NUM)SIGTERMSIGINTSIGHUPSIGQUITSIGUSR1SIGUSR2c                    | y	 t        |       }t        j                  |d|       S # t        t        f$ r t        |       cY S w xY w)zBReturn a human-readable signal name (or ``str(sig)`` as fallback).UNKNOWNzsignal#)int	TypeError
ValueErrorstrr	   get)sigsig_ints     ?/home/ubuntu/.hermes/hermes-agent/gateway/shutdown_forensics.py_signal_namer   %   sS    
{c( ""7ggY,?@@ z" 3xs   ) AAc                   	 t        d|  dd      5 }|D ]D  }|j                  |dz         s|j                  dd      d   j                         c cddd       S  	 ddd       y# 1 sw Y   yxY w# t        t
        t        f$ r Y yw xY w)zIRead a single field from /proc/<pid>/status.  Linux only; None elsewhere./proc/z/statusutf-8encoding:   N)open
startswithsplitstripFileNotFoundErrorPermissionErrorOSError)pidkeyfhlines       r   _read_proc_fieldr-   0   s    F3%w'': 	9b 9??39-::c1-a06688	9 	99	9 	9  8 s?   A3 A'$A'	A3 A'A3 'A0,A3 0A3 3B
	B
c                
   	 t        d|  dd      5 }|j                         }ddd       sy|j                  dd      j                  dd	      j                         S # 1 sw Y   =xY w# t        t        t        f$ r Y yw xY w)
zLRead /proc/<pid>/cmdline as a printable string.  Linux only; None elsewhere.r   z/cmdlinerbN        r   replace)errors)r"   readr&   r'   r(   r2   decoder%   )r)   r+   datas      r   _read_proc_cmdliner7   <   s    F3%x($/ 	2779D	 <<&--gi-HNNPP	 	8 s'   A+ AA+ A($A+ +BBc                :   d| i}| dk  r|S t        | d      }|||d<   t        | d      }|||d<   t        | d      }|	 t        |      |d	<   t        | d
      }||r|j                         d   n||d<   t	        |       }|r|dd |d<   |S # t        $ r Y Jw xY w)zCompact /proc/<pid> snapshot: pid, ppid, state, uid, cmdline.

    Best-effort.  Missing fields are simply omitted rather than raising.
    r)   r   NameNnameStatestatePPidppidUiduid,  cmdline)r-   r   r   r$   r7   )r)   summaryr:   r<   r>   r@   rB   s          r   _proc_summaryrD   I   s    
  %clG
axC(DS'*E C(D	!$iGFO 3
&C
+.QC %G$Tc]	N  		s   B 	BBc           
        t        j                          }t        j                         }t        j                         }t        j                         }||t        |       | t        |       nd||t        |      t        |      d}t        j                  j                  d      }|r||d<   t        j                  j                  d      }|r||d<   t        |      xs |dk(  |d<   	 t        j                         d	   |d
<   	 t        |d      }|N|dk7  rI|j                         rt        |      n||d<   |j                         rt        t        |            nd|d<   	 t        j                  j                  d      }	|	rt%        |	      dz  }
|
j'                         r.	 |
j)                  d      }|dd |d<   d| |v xs d| |v |d<   t%        |	      dz  }|j'                         r	 |j)                  d      }|dd |d<   |S |S # t        t        f$ r Y w xY w# t         t"        f$ r Y w xY w# t        $ r Y sw xY w# t        $ r Y |S w xY w# t*        $ r Y |S w xY w)a'  Fast (<10ms) snapshot of who/what is asking us to shut down.

    Captures:

    * The signal number/name (so SIGINT vs SIGTERM is visible)
    * Our own PID/ppid + parent process info from /proc (Linux)
    * Whether systemd is our parent (``ppid==1`` or ``INVOCATION_ID`` set)
    * Whether takeover/planned-stop markers exist (consumed lazily by the caller)
    * /proc/self limits + load average (1-min)
    * Wall-clock and monotonic timestamps for cross-correlating later phases

    Pure stdlib, never raises, never blocks on subprocesses.
    N)tsts_monotonicsignal
signal_numr)   r>   parentselfINVOCATION_IDsystemd_invocation_idJOURNAL_STREAMsystemd_journal_streamr!   under_systemdr   
loadavg_1m	TracerPid0
tracer_pidtracerHERMES_HOMEz.gateway-takeover.jsonr   r   rA   takeover_markerz"target_pid": z'target_pid': takeover_marker_for_selfz.gateway-planned-stop.jsonplanned_stop_marker)time	monotonicosgetpidgetppidr   r   rD   environr   bool
getloadavgr(   AttributeErrorr-   isdigitr   r   r   exists	read_text	Exception)received_signalnowr[   r)   r>   ctxinvocation_idjournal_streamrU   hermes_home_strtakeover_pathrawplanned_stop_paths                r   snapshot_shutdown_contextrp   h   s    ))+C I
))+C::<D !/.=.Ic/*t%c"	C JJNN?3M'4#$ZZ^^$45N(6$%.;$!)CMMOA.L!#{3&C-/5~~/?FVC:@..:JM#f+6PTCM**..7 14LLM##%'1171CC-0#YC)*(.#5 9+C51S8 23 !%_ 58T T '')+55w5GC14TcC-. J3J[ ^$  z" ,     J Jsy   &H AH ?I -H3 I *I HHH0/H03	H?<I >H??I 	II II 	IIg      @)timeout_secondsc                  	 | j                   j                  dd       t        j                  dk(  ryd| dt        j                          d}	 t        j                  t        |       t
        j                  t
        j                  z  t
        j                  z  d      }	 t        j                  d	|d
dd|g|t        j                  t        j                  dd      }	 	 t        j"                  |       |j$                  S # t        $ r Y yw xY w# t        $ r Y yw xY w# t         t        f$ rN 	 t        j"                  |       n# t        $ r Y nw xY wY 	 t        j"                  |       y# t        $ r Y yw xY ww xY w# t        $ r Y |j$                  S w xY w# 	 t        j"                  |       w # t        $ r Y w w xY wxY w)a  Fire-and-forget ``ps``-style snapshot written to ``log_path``.

    Runs as a detached subprocess so it can't block the asyncio event loop
    or compete with platform teardown.  The subprocess uses its own
    ``timeout`` so a wedged ``ps`` still self-cleans within
    ``timeout_seconds``.

    Returns the subprocess PID on success, ``None`` on failure.  Never
    raises.

    We deliberately avoid ``subprocess.run(["ps", "aux"])`` from inside the
    signal handler (the pre-existing pattern): on a busy host with hundreds
    of processes, ``ps aux`` can take >2s to walk /proc, during which the
    asyncio loop is frozen and adapter teardown can't begin.
    T)parentsexist_okNwin32z echo '=== shutdown diagnostic @ z ==='; echo '--- date ---'; date -u +%Y-%m-%dT%H:%M:%SZ; echo '--- ps auxf (top 60 by cpu) ---'; ps auxf --sort=-pcpu 2>/dev/null | head -60; echo '--- pstree of self ---'; pstree -plau a   2>/dev/null | head -40 || true; echo '--- /proc/loadavg ---'; cat /proc/loadavg 2>/dev/null || true; echo '--- recent dmesg (oom/killed) ---'; dmesg -T 2>/dev/null | tail -20 || journalctl --user -n 20 --no-pager 2>/dev/null | tail -20 || true; echo '=== end ==='i  timeoutz.0fbashz-c)stdoutstderrstdinstart_new_session	close_fds)rJ   mkdirr(   sysplatformr\   r]   r"   r   O_WRONLYO_CREATO_APPEND
subprocessPopenSTDOUTDEVNULLr&   closer)   )log_pathsignal_namerq   scriptfdprocs         r   spawn_async_diagnosticr      s   *dT: ||w +;- 8
 		} %
	  WWS]BKK"**$<r{{$JER ?3/&$G$$$$"
 	HHRL 88Ou  8  " w' 	HHRL 			HHRL 		  	88O		HHRL 		s   D AD  >D  E? 	DD	DDE</EE<	EE<EE<F E, ,	E87E8;E<<F ?	FFG F10G 1	F=:G <F==G c                   | j                  dd      }| j                  d      xs i }|j                  dd      }|j                  d      xs d}|j                  d      xs d}| j                  d      rd	nd
}| j                  d      }t        |t        t        f      r|dnd}g }	| j                  d      )| j                  d      }
|	j	                  d|
rdnd        | j                  d      |	j	                  d       | j                  d      r|	j	                  d| d           |	rddj                  |	      z   nd}d| d| d| d| d| | d|S )z?Render a shutdown context dict as a single, scannable log line.rH   ?rJ   rB   z	(unknown)r:   r)   rP   yesnorQ   z.2frW   rX   ztakeover_marker_present=rK   otherrY   zplanned_stop_marker_present=yesrT   ztracer_pid=  zsignal=z under_systemd=z parent_pid=z parent_name=z loadavg_1m=z parent_cmdline=)r   
isinstancer   floatappendjoin)ri   r   rJ   
parent_cmdparent_name
parent_pidrP   loadload_strextrasfor_self
extras_strs               r   format_context_for_logr     sw   
''(C
 CWWX$"FI{3J**V$+KE")cJ WW_5E4M77< D *4#u >$sCHF
ww !-7756&vw&GH	
 ww$%178
ww|C$5#678-3#((J # & ( \ ""m $Z, $	)    c                f    	 t        j                  | t        d      S # t        t        f$ r Y yw xY w)zFJSON-serialise a context dict for structured ingestion.  Never raises.T)default	sort_keysz{})jsondumpsr   r   r   )ri   s    r   context_as_jsonr   :  s1    zz#sd;;z" s    00c                j   t         j                  j                  d      }|syd}	 t        dd      5 }|D ]O  }d|v s|j	                         j                  d      }t        |      D ]  }|j                  d      s|} n |sO n ddd       |syd}dgg fD ]  }	 t        j                  d	g|d
|dddd      }	|	j                  dk7  r6|	j                  j                         D ]b  }|j!                  d      s|j                  dd      d   j	                         }
|
j#                         rt%        |
      }nt'        |
      }|b n | n |y|dz  }d}| |z   }||| |||k  dS # 1 sw Y   xY w# t        t        f$ r Y w xY w# t        t        j                  t        f$ r Y w xY w)u.  At startup, sanity-check that systemd's TimeoutStopSec >= drain_timeout.

    When the gateway is run under a stale systemd unit file (e.g. the user
    upgraded hermes-agent but never re-ran ``hermes setup`` to regenerate
    the unit), ``TimeoutStopSec`` can be smaller than the configured
    ``restart_drain_timeout``.  Result: SIGTERM arrives, the drain starts,
    and systemd SIGKILLs the cgroup mid-drain — looks like a phantom kill
    in the journal because the journal only logs ``code=killed status=9``.

    Returns ``None`` when the alignment is fine OR we can't determine it
    (not running under systemd, ``systemctl`` unavailable, etc.).  Returns
    a dict with ``timeout_stop_sec`` + ``drain_timeout`` + ``mismatch``
    bool when we have data to report.

    Best-effort.  Never raises.
    rL   Nz/proc/self/cgroupr   r   z.service/z--user	systemctlshowz--property=TimeoutStopUSecTg       @)capture_outputtextrv   r   zTimeoutStopUSec==r!   g    .Ag      >@)unittimeout_stop_secdrain_timeoutexpected_minmismatch)r\   r_   r   r"   r%   r$   reversedendswithr(   r&   r   runTimeoutExpired
returncoderx   
splitlinesr#   rc   r   _parse_systemd_duration_to_us)r   rj   	unit_namer+   r,   partsp
timeout_usflagresultvaluer   headroomexpecteds                 r   check_systemd_timing_alignmentr   B  s	   " JJNN?3M  $I%8 
	B 	% JJL..s3E%e_ "::j1()I!" !	
	 
 !%JR  	^^UtUVUYU8TU#$F !MM,,. 		D12

3*1-335==?!$UJ!>u!EJ)		 !-0 !K/ Hx'H,& $x/ k
	 
	 &'  ":#<#<gF 		sK   E; 
E/>E/=E/E/E; "F/E84E; ;FFF21F2c                p   | sydddddddd}d}d	}d	}| d
z   D ]  }|j                         s|dk(  rL|rD|j                  |j                               }||s y	 |t        t	        |      |z        z  }d	}d	}||z  }d|j                         r||z  }z|rE|rC|j                  |j                               }| y	 |t        t	        |      |z        z  }d	}d	}|s|r	 |t        t	        |      dz        z  }d	} |dkD  r|S dS # t
        $ r Y  yw xY w# t
        $ r Y  yw xY w# t
        $ r Y  yw xY w)zParse 'TimeoutStopUSec=1min 30s' / '90s' style values to microseconds.

    systemd accepts a wide grammar; we cover the common cases (s, ms, min,
    h) and return None on anything unexpected.  Never raises.
    Nr!   i  i@B i l    $'- )usmsssecminhhrr   r   r   .)rc   r   lowerr   r   r   isalpha)rn   unitstotal_ustokendigitsch
multipliers          r   r   r     s    E HEFCi  ::<29"YYu{{}5
%V E&MJ$> ??H bLFZZ\RKE5;;=1J!Cf
 :;; FEECf	 9:: FA B  !|8--1 "       s6   D;D!D(	DD	D%$D%(	D54D5)r   r   returnr   )r)   r   r*   r   r   Optional[str])r)   r   r   r   )r)   r   r   Dict[str, Any])N)rg   r   r   r   )r   r   r   r   rq   r   r   Optional[int])ri   r   r   r   )r   r   r   zOptional[Dict[str, Any]])rn   r   r   r   ) __doc__
__future__r   r   r\   rH   r   r~   rZ   pathlibr   typingr   r   r   r   r	   __annotations___namegetattr_valr   r   r-   r7   rD   rp   r   r   r   r   r    r   r   <module>r      s   " #  	   
   , , ') ^ (M /E65$'D).CI&/A	
Q>ZB !	QQQ 	Q
 QhBTn5.r   