**
** Clean Compustat name file
** Bronwyn Hall, 11 Sep 2006
**
* Overview of what this script does, in order:
*   1. Reads a country-code lookup and merges it onto the Compustat header
*      file (cshdr05) by ctrycode.
*   2. Builds standard_name from the company name: upper-cased and padded
*      with one blank on each side.
*   3. Applies a fixed sequence of substring recodes and whole-name overrides.
*   4. Runs the shared name-cleaning do-files found in $NAMDIR.
*   5. Keeps one observation per stem name and saves $CSDIR/cshdr05_std.
* Statement order matters throughout: each replace operates on the result
* of the previous one.
set more 1
global CSDIR "compstat"
global NAMDIR "patdata/chptdata/names"
cap log using rd_types,t replace

* --- Country lookup: keep ctrycode plus the code held in the uspto column,
* --- which is renamed to country.
insheet using $CSDIR/compustat_country_codes.txt,clear
rename code ctrycode
drop country
rename uspto country
label var country "Headquarters country from Compustat"
sort ctrycode
save temp,replace

* --- Compustat header file: attach country via ctrycode (renamed from finc).
use $CSDIR/cshdr05,clear
rename finc ctrycode
sort ctrycode
merge ctrycode using temp,nokeep
tab _m
drop _m
* A missing or zero country code is treated as US.
replace country = "US" if ctrycode==. | ctrycode==0
drop ctrycode
tab country

** Clean names
rename coname name
rename file csfile
gen file = "CS"
gen asstype = "firm"
* Fixed seed so the random subsample flagged by list (about 1 in 1000
* observations, used only for the spot-check listings below) is reproducible.
set seed 458925
gen list = uniform()>.999

gen standard_name = " "+trim(name)+" " /* so we can handle words at beg and end of string*/
replace standard_name=upper(standard_name)

/*0 Special Compustat recoding */
* Each pattern below is replaced by a single blank (up to 30 occurrences per
* name). Most patterns carry a trailing blank so that only whole trailing
* tokens are matched.
replace standard_name = subinstr(standard_name,"-ADR"," ",30)
replace standard_name = subinstr(standard_name,"-ADS"," ",30)
replace standard_name = subinstr(standard_name,"-CL A "," ",30)
replace standard_name = subinstr(standard_name,"-CL B "," ",30)
replace standard_name = subinstr(standard_name,"-CONN "," ",30)
replace standard_name = subinstr(standard_name,"-CONSOLIDATED "," ",30)
replace standard_name = subinstr(standard_name,"-DEL "," ",30)
replace standard_name = subinstr(standard_name,"-DE "," ",30)
replace standard_name = subinstr(standard_name,"-NY SHARES "," ",30)
replace standard_name = subinstr(standard_name,"-OLD "," ",30)
replace standard_name = subinstr(standard_name,"-ORD "," ",30)
replace standard_name = subinstr(standard_name,"-PRE FASB "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORM "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"- PRO FORMA "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA1 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA2 "," ",30)
replace standard_name = subinstr(standard_name,"-PRO FORMA3 "," ",30)
replace standard_name = subinstr(standard_name,"-REDH "," ",30)
replace standard_name = subinstr(standard_name,"-SER A COM "," ",30)
replace standard_name = subinstr(standard_name,"-SER A "," ",30)
replace standard_name = subinstr(standard_name,"-SPN "," ",30)

* Abbreviation expansion / normalisation. Patterns are blank-delimited on both
* sides so only whole words match; longer or plural forms are listed before
* their shorter prefixes (e.g. HLDGS before HLDG, DPT STS before DPT).
replace standard_name = subinstr(standard_name," ACCPTNCE "," ACCEPTANCE ",30)
replace standard_name = subinstr(standard_name," BANCORPORATION "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCORPORTN "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BANCRP "," BANCORP ",30)
replace standard_name = subinstr(standard_name," BNCSHRS "," BANCSHARES ",30)
replace standard_name = subinstr(standard_name," BRWG "," BREWING ",30)
replace standard_name = subinstr(standard_name," CHEVRONTEXACO "," CHEVRON TEXACO ",30)
replace standard_name = subinstr(standard_name," CHSE "," CHASE ",30)
replace standard_name = subinstr(standard_name," COMMN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUN "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATNS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," COMMUNICATIONS "," COMMUNICATION ",30)
replace standard_name = subinstr(standard_name," DPT STS "," DEPT STORES ",30)
replace standard_name = subinstr(standard_name," DPT "," DEPT ",30)
replace standard_name = subinstr(standard_name," ENRGY "," ENERGY ",30)
replace standard_name = subinstr(standard_name," FINL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," FNCL "," FINANCIAL ",30)
replace standard_name = subinstr(standard_name," GRP "," GROUP ",30)
replace standard_name = subinstr(standard_name," HLDGS "," HOLDINGS ",30)
replace standard_name = subinstr(standard_name," HLDG "," HOLDING ",30)
replace standard_name = subinstr(standard_name," HLT NTWK "," HEALTH NETWORK ",30)
replace standard_name = subinstr(standard_name," HTLS RES "," HOTELS & RESORTS ",30)
replace standard_name = subinstr(standard_name," HLTH "," HEALTH ",30)
replace standard_name = subinstr(standard_name," INTRTECHNLGY "," INTERTECHNOLOGY ",30)
replace standard_name = subinstr(standard_name," JPMORGAN "," J P MORGAN ",30)
replace standard_name = subinstr(standard_name," MED OPTIC "," MEDICAL OPTICS ",30)
replace standard_name = subinstr(standard_name," MINNESOTA MINING AND MANUFACTURING COMPANY "," 3M COMPANY ",30)
replace standard_name = subinstr(standard_name," NAT RES "," NATURAL RESOURCES ",30)
replace standard_name = subinstr(standard_name," NETWRKS "," NETWORK ",30)
replace standard_name = subinstr(standard_name," PHARMACTICALS "," PHARM ",30)
replace standard_name = subinstr(standard_name," PHARMACT "," PHARM ",30)
replace standard_name = subinstr(standard_name," PPTYS TST "," PROPERTIES TRUST ",30)
replace standard_name = subinstr(standard_name," PPTY "," PROPERTY ",30)
replace standard_name = subinstr(standard_name," PROPERTY TR "," PROPERTY TRUST ",30)
replace standard_name = subinstr(standard_name," PAC RAILWY "," PACIFIC RAILWAY ",30)
replace standard_name = subinstr(standard_name," SEMICONDTR "," SEMICONDUCTOR ",30)
replace standard_name = subinstr(standard_name," SOLU "," SOLUTIONS ",30)
replace standard_name = subinstr(standard_name," ST & ALMN "," STEEL & ALUMINUM ",30)
replace standard_name = subinstr(standard_name," STD "," STANDARD ",30)
replace standard_name = subinstr(standard_name," TECHNOLGS "," TECH ",30)
replace standard_name = subinstr(standard_name," TECHNOL "," TECH ",30)
replace standard_name = subinstr(standard_name," TRANSPORTATN "," TRANSPORTATION ",30)

* Whole-name overrides: when the trimmed standard_name (already upper-cased
* and recoded above) equals the right-hand string exactly, it is replaced by
* the blank-padded left-hand string.
* Presumably the targets are the spellings used in the patent assignee data
* these names are later matched to - confirm.
*
* NOTE(review): in the copy this file was recovered from, a lone "*" sat
* directly before the U.S. PHILIPS override. With line breaks lost it cannot
* be determined whether that was a blank comment line or a marker disabling
* this one override. It is left disabled here; re-enable it by removing the
* leading "*" if the original had it active.
* replace standard_name=" U.S. PHILIPS CORPORATION " if trim(standard_name)=="NORTH AMERICAN PHILIPS CORP"
replace standard_name=" A. L. WILLIAMS CORP. " if trim(standard_name)=="WILLIAMS (A.L.) CORP"
replace standard_name=" B. F. GOODRICH CO. " if trim(standard_name)=="GOODRICH CORP"
replace standard_name=" BELL + HOWELL COMPANY " if trim(standard_name)=="BELL & HOWELL OPERATING CO"
replace standard_name=" BENDIX CORPORATION(NOW ALLIED-SIGNAL INC.) " if trim(standard_name)=="BENDIX CORP"
replace standard_name=" BORG-WARNER CORPORATION " if trim(standard_name)=="BORGWARNER INC"
replace standard_name=" CHRYSLER MOTORS CORPORATION " if trim(standard_name)=="CHRYSLER CORP"
replace standard_name=" CISCO TECHNOLOGY, INC. " if trim(standard_name)=="CISCO SYSTEMS INC"
replace standard_name=" DELL PRODUCTS, L.P. " if trim(standard_name)=="DELL INC"
replace standard_name=" DELPHI TECHNOLOGIES, INC. " if trim(standard_name)=="DELPHI CORP"
replace standard_name=" E. I. DU PONT DE NEMOURS AND COMPANY " if trim(standard_name)=="DU PONT (E I) DE NEMOURS"
replace standard_name=" E. R. SQUIBB + SONS, INC. " if trim(standard_name)=="SQUIBB CORP"
replace standard_name=" ELI LILLY AND COMPANY " if trim(standard_name)=="LILLY (ELI) & CO"
replace standard_name=" G. D. SEARLE & CO. " if trim(standard_name)=="SEARLE (G.D.) & CO"
replace standard_name=" MINNESOTA MINING AND MANUFACTURING COMPANY " if trim(standard_name)=="3M CO"
replace standard_name=" OWENS-CORNING FIBERGLAS CORPORATION " if trim(standard_name)=="OWENS CORNING"
replace standard_name=" SCHLUMBERGER TECHNOLOGY CORPORATION " if trim(standard_name)=="SCHLUMBERGER LTD"
* NOTE(review): "SICMED" may be a transposition of "SCIMED"; if so this
* override never fires. Check the spelling of the name in cshdr05 before
* changing it.
replace standard_name=" SCI-MED LIFE SYSTEMS, INC. " if trim(standard_name)=="SICMED LIFE SYSTEMS"
replace standard_name=" TDK CORPORATION " if trim(standard_name)=="TDK CORP"
replace standard_name=" UNITED STATES SURGICAL CORPORATION " if trim(standard_name)=="U S SURGICAL CORP"
replace standard_name=" W. R. GRACE & CO. " if trim(standard_name)=="GRACE (W R) & CO"
replace standard_name=" WESTINGHOUSE ELECTRIC CORP. " if trim(standard_name)=="WESTINGHOUSE ELEC"

* --- Shared cleaning steps, defined in do-files under $NAMDIR (not shown
* --- here). The list commands print the random subsample for inspection.
/*1*/ do $NAMDIR/punctuation
list name country standard_name if list,clean compress noobs str(35)

/*2*/ qui do $NAMDIR/standard_name
list name country standard_name if list,clean compress noobs str(35)
* Also show any raw names longer than 100 characters.
list name standard_name if length(name)>100,clean compress noobs str(40)

/*3*/ qui do $NAMDIR/corporates

/*4*/ qui do $NAMDIR/stem_name
* Tabulate how many observations repeat the previous observation's stem name.
sort stem_name
gen same = stem_name==stem_name[_n-1]
tab same
drop same

/*5*/ qui do $NAMDIR/trim_and_strip
* After the final step, flag and list every observation whose stem name is
* shared with a neighbour in sort order.
sort stem_name
gen same = stem_name==stem_name[_n-1] | stem_name==stem_name[_n+1]
tab same
list name stem_name if same,clean compress str(40) noobs

label var stem_name "Std name excluding inc, co, etc"
label var standard_name "Standardized name of firm"

* --- Tidy up and save.
drop file asstype list same
rename name cname
aorder
compress
sort stem_name
* nsame = number of observations with non-missing first within each stem name.
* "stem" is presumably relied on as an abbreviation of stem_name, and
* first / last are presumably variables carried in from cshdr05 - confirm.
egen nsame = count(first),by(stem)
tab nsame
label var nsame "N with same stem name on Compustat"
* Within each stem name keep only the observation sorted last by last.
sort stem last
keep if stem~=stem[_n+1] /* remove earlier duplicate stem names */
desc
save $CSDIR/cshdr05_std,replace
log close