Persistent hangs & errors while running an HQL partitioned INSERT

# Error log
hive> INSERT overwrite TABLE partition_table PARTITION(key)
    > SELECT
    > fare_amount
    > ,from_unixtime(unix_timestamp(pickup_datetime, 'yyyy-MM-dd HH:mm:ss'))
    > ,pickup_longitude
    > ,pickup_latitude
    > ,dropoff_longitude
    > ,dropoff_latitude
    > ,passenger_count
    > ,SUBSTRING(key,1,7) AS key
    > FROM data_table;
Query ID = ubuntu_20221019171238_5bf9fed8-9880-458d-8681-caa4880be317
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks not specified. Estimated from input data size: 23   
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1666027081523_0040, Tracking URL = <http://nn2:8088/proxy/application_1666027081523_0040/>
Kill Command = /usr/local/hadoop/bin/mapred job  -kill job_1666027081523_0040
Hadoop job information for Stage-1: number of mappers: 22; number of reducers: 23
2022-10-19 17:12:48,179 Stage-1 map = 0%,  reduce = 0%
2022-10-19 17:13:48,800 Stage-1 map = 0%,  reduce = 0%, Cumulative CPU 59.09 sec
2022-10-19 17:14:49,231 Stage-1 map = 0%,  reduce = 0%, Cumulative CPU 26.52 sec
2022-10-19 17:15:49,589 Stage-1 map = 0%,  reduce = 0%, Cumulative CPU 22.23 sec
2022-10-19 17:16:37,701 Stage-1 map = 100%,  reduce = 100%
java.io.IOException: java.net.ConnectException: Your endpoint configuration is wrong; For more details see:  <http://wiki.apache.org/hadoop/UnsetHostnameOrPort>
        at org.apache.hadoop.mapred.ClientServiceDelegate.invoke(ClientServiceDelegate.java:345)
        at org.apache.hadoop.mapred.ClientServiceDelegate.getJobStatus(ClientServiceDelegate.java:430)
        at org.apache.hadoop.mapred.YARNRunner.getJobStatus(YARNRunner.java:872)
        at org.apache.hadoop.mapreduce.Cluster.getJob(Cluster.java:215)    
        at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:602)    
        at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:600)    
        at java.security.AccessController.doPrivileged(Native Method)      
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1762)
        at org.apache.hadoop.mapred.JobClient.getJobUsingCluster(JobClient.java:600)
        at org.apache.hadoop.mapred.JobClient.getJobInner(JobClient.java:610)
        at org.apache.hadoop.mapred.JobClient.getJob(JobClient.java:640)   
        at org.apache.hadoop.hive.ql.exec.mr.HadoopJobExecHelper.progress(HadoopJobExecHelper.java:295)
        at org.apache.hadoop.hive.ql.exec.mr.HadoopJobExecHelper.progress(HadoopJobExecHelper.java:559)
        at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:433)
        at org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:149)
        at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:205)  
        at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:97)
        at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:2664)   
        at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:2335)      
        at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:2011)  
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1709)
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1703)
        at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:157)
        at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:218)
        at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:239)
        at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:188)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:402)
        at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:821)
        at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:759)    
        at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:683)   
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)     
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:323)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:236)
Caused by: java.net.ConnectException: Your endpoint configuration is wrong; For more details see:  <http://wiki.apache.org/hadoop/UnsetHostnameOrPort>  
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:423) 
        at org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:842)
        at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:757) 
        at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1583)   
        at org.apache.hadoop.ipc.Client.call(Client.java:1525)
        at org.apache.hadoop.ipc.Client.call(Client.java:1422)
        at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:231)
        at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:118)
        at com.sun.proxy.$Proxy88.getJobReport(Unknown Source)
        at org.apache.hadoop.mapreduce.v2.api.impl.pb.client.MRClientProtocolPBClientImpl.getJobReport(MRClientProtocolPBClientImpl.java:133)
        at sun.reflect.GeneratedMethodAccessor27.invoke(Unknown Source)    
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.mapred.ClientServiceDelegate.invoke(ClientServiceDelegate.java:326)
        ... 36 more
Caused by: java.net.ConnectException: Connection refused
        at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)        
        at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:716)
        at org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:205)
        at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:535)       
        at org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:701)
        at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:822)
        at org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:414)
        at org.apache.hadoop.ipc.Client.getConnection(Client.java:1653)    
        at org.apache.hadoop.ipc.Client.call(Client.java:1469)
        ... 45 more
Ended Job = job_1666027081523_0040 with exception 'java.io.IOException(java.net.ConnectException: Your endpoint configuration is wrong; For more details see:  <http://wiki.apache.org/hadoop/UnsetHostnameOrPort>)'
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask. java.net.ConnectException: Your endpoint configuration is wrong; For more details see:  <http://wiki.apache.org/hadoop/UnsetHostnameOrPort>
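Reading the Caused by chain bottom-up, the root is a plain Connection refused: the Hive client polling getJobReport could not reach the job-status endpoint (the MapReduce ApplicationMaster or JobHistory server) at all. A quick first check, assuming a vanilla Hadoop install like ours under /usr/local/hadoop, is simply to list the Java processes on each node and see whether a daemon has silently died:

# Are the daemons that serve the job-status endpoint still alive?
jps
# Expect ResourceManager / NodeManager / JobHistoryServer (plus NameNode /
# DataNode) among the output; if one is missing, the "Connection refused"
# above is the client talking to a process that no longer exists.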

Attempted fixes

  1. Adjusting memory allocation
# mapred-site.xml
<property>
	<!-- Memory given to each map container -->
	<name>mapreduce.map.memory.mb</name>
	<value>16384</value>
</property>
<property>
	<!-- Memory given to each reduce container -->
	<name>mapreduce.reduce.memory.mb</name>
	<value>8124</value>
</property>
	'''
	~~Setting the reduce container memory to twice the map container memory (mapreduce.map.memory.mb) is the usual rule of thumb~~
	-> In our team's case the job kept failing during the map phase,
	and we judged the cause to be insufficient mapper memory.
	'''

<property>
	<!-- Java options applied when launching a map container's JVM -->
	<name>mapreduce.map.java.opts.max.heap</name>
	<value>4096</value>
</property>
'''
Sets the heap size via the -Xmx option;
usually set to about 80% of the map container memory (mapreduce.map.memory.mb).
'''
# yarn-site.xml
<property>
	<!-- Minimum memory that can be allocated to a single container (default 1 GB -> 8 GB) -->
	<name>yarn.scheduler.minimum-allocation-mb</name>
	<value>8192</value>
</property>
<property>
	<!-- Maximum memory that can be allocated to a single container (default 8 GB -> 15 GB) -->
	<name>yarn.scheduler.maximum-allocation-mb</name>
	<value>15000</value>
</property>
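A sanity check worth doing right after edits like these: in the Hive session, `set <property>;` with no value echoes the setting currently in effect, which confirms the new XML was actually picked up after restarting the daemons:

# hive CLI
set mapreduce.map.memory.mb;      -- prints mapreduce.map.memory.mb=16384 if the edit took effect
set mapreduce.reduce.memory.mb;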

  2. Adjusting reducers

# hive HQL
-- Sets the amount of input data handled per reducer (here 256 MB; 128000000 was noted as an alternative).
-- Hive derives the number of reducers from this size.
set hive.exec.reducers.bytes.per.reducer=256000000;
-- For ~6 GB of input we judged about 15 reducers to be appropriate.

-- Maximum number of reducers
set hive.exec.reducers.max=128;

-- Fix the number of reducers explicitly
set mapreduce.job.reduces=3;

# hive HQL
set hive.exec.reducers.bytes.per.reducer=256000000;
set hive.exec.reducers.max=100;
set mapreduce.job.reduces=100;
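For reference, when Hive runs on MapReduce it estimates the reducer count from the total input size and hive.exec.reducers.bytes.per.reducer, capped by hive.exec.reducers.max; this is where the "Estimated from input data size: 23" line in the log comes from:

-- Sketch of Hive's reducer estimate (not the exact source code):
--   reducers = min(hive.exec.reducers.max,
--                  ceil(total_input_bytes / hive.exec.reducers.bytes.per.reducer))
-- Working backwards, 23 reducers at 256 MB each corresponds to roughly
-- 23 * 256 MB ≈ 5.9 GB of input, in line with the ~6 GB noted above.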
  3. Container configuration (sketched below)
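We didn't record the exact values tried in this step, but the container-level knob that interacts with everything above is how much memory each NodeManager advertises to YARN; the value below is an illustrative assumption, not what we actually ran:

# yarn-site.xml (illustrative)
<property>
	<!-- Total memory one NodeManager offers to containers.
	     Assumed value for illustration; it must fit in the node's physical RAM. -->
	<name>yarn.nodemanager.resource.memory-mb</name>
	<value>8192</value>
</property>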

Resolution

Combing through the official documentation, Google searches, and overseas blogs, we arrived at the three approaches above.

None of them solved the problem, however. After hearing our situation, our mentor pointed out that we needed to verify whether memory was actually being allocated as configured.
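That advice is easy to act on with the yarn CLI, comparing what YARN believes each node can allocate against the node's physical memory:

# List the NodeManagers, then inspect one node's capacity
yarn node -list -all
yarn node -status <node-id>   # shows Memory-Capacity for that node
# Compare against the hardware itself
free -h
# A t3.small has only 2 GB of RAM, so the 16 GB map containers requested
# in mapred-site.xml could never be granted, whatever the XML said.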

In the end we addressed what we had suspected was the fundamental problem, the performance of the instance itself: we changed it from t3.small to t3.xlarge, and on the next attempt the problem resolved itself.

Fortunately the issue was fixed, but for how simple the final fix turned out to be, we spent far too much time and energy getting there.

Rather than simply trying one method after another, if we had dug into why each attempt failed and why it had no effect, one by one, we could probably have reached the solution much faster.